示例#1
0
def cli_main():
    """Command-line entry point: analyse the topic model and store results.

    An optional ``-t/--numtopics`` integer overrides the number-of-topics
    property in ``Constants``. The current properties are then merged with
    the output of ``analyze_topics(include_stability=True)`` and handed to
    the CSV and JSON result writers.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')
    parsed = arg_parser.parse_args()

    # nargs=1 yields a one-element list, or None when the flag is absent.
    if parsed.numtopics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: parsed.numtopics[0]})

    summary = Constants.get_properties_copy()
    summary.update(analyze_topics(include_stability=True))

    # Both file names are generated (csv first, then json) before writing.
    output_files = [
        Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)
        for extension in ('csv', 'json')
    ]

    write_results_to_csv(output_files[0], summary)
    write_results_to_json(output_files[1], summary)
示例#2
0
def main():
    """Create a topic model according to the command-line arguments.

    Optional flags:
        -c/--cycle      index of the running cycle
        -f/--fold       index of the cross validation fold
        -t/--numtopics  number of topics (overrides the ``Constants`` value)

    When neither a cycle nor a fold is given, the processed records file is
    loaded and one model is built from it (from the first half only when
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set). Otherwise
    ``create_single_topic_model(cycle, fold)`` is called.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    # nargs=1 produces a one-element list, or None when the flag is omitted.
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            # Floor division keeps the slice bound an integer; with true
            # division a float bound makes the slice raise TypeError.
            records = records[:len(records) // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
示例#3
0
def preprocess_data():
    """Run the full preprocessing cycle for every topic count from 2 to 60.

    Each iteration sets the number-of-topics property and then runs a
    cache-enabled ``ReviewsPreprocessor`` through ``full_cycle()``.
    """
    for num_topics in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        ReviewsPreprocessor(use_cache=True).full_cycle()
示例#4
0
文件: main.py 项目: swarnamd/yelp
def dataset_bucket_analysis_by_field(field):
    """Print frequency statistics of ``field`` over the hotel dataset.

    Switches the business type to ``'fourcity_hotel'``, loads the processed
    records, counts how many records share each value of ``field``, prints
    the (up to) three most frequent values, and finally prints and plots the
    summary produced by ``ReviewsDatasetAnalyzer``.

    :param field: key looked up in every record (for instance a user or
        item identifier field)
    """
    # Set the dataset
    Constants.update_properties(
        {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'})

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Loaded %d records' % len(records))

    frequency_map = {}
    for record in records:
        value = record[field]
        frequency_map[value] = frequency_map.get(value, 0) + 1

    print('There is a total of %d %ss' % (len(frequency_map), field))
    ranked = sorted(
        frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    # Slice rather than index [0], [1], [2]: datasets with fewer than three
    # distinct values would otherwise raise IndexError.
    for entry in ranked[:3]:
        print(entry)

    # Number of reviews per value of the field.
    # NOTE(review): the average always divides by rda.num_users, whatever
    # ``field`` is -- confirm this is intended for non-user fields.
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    try:
        print(users_summary)
    finally:
        # Restore the display option even if printing fails.
        pandas.reset_option('display.max_rows')
示例#5
0
def manual_main():
    """Analyse topics (stability excluded) for each configured topic count.

    The CSV and JSON output names are generated once and printed. For every
    entry of the topic-count list (currently only
    ``Constants.TOPIC_MODEL_NUM_TOPICS``) the property is applied, the
    analysis is run, and the merged properties/results dictionary is passed
    to both result writers.
    """
    output_args = (Constants.RESULTS_FOLDER, None, None, False)
    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', *output_args)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', *output_args)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    # enumerate(..., 1) provides the 1-based progress counter.
    for cycle_index, num_topics in enumerate(num_topics_list, 1):
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))

        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        print(overrides)
        Constants.update_properties(overrides)

        summary = Constants.get_properties_copy()
        summary.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, summary)
        write_results_to_json(json_file_name, summary)
示例#6
0
def run_recommender(args):
    """Run the context top-N recommender for one set of parameters.

    The numeric hyper-parameters in ``args`` are cast to ``int`` in place,
    ``args`` is applied to ``Constants``, and a ``ContextTopNRunner`` is
    executed.

    :param args: dictionary of property values; it is modified in place
    :return: the runner's results dictionary extended with ``'loss'`` (the
        negated evaluation metric) and ``'status'`` (``'ok'``)
    """
    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    source_path = '/home/fpena/yelp/source/python'
    # Only register the project path once: appending unconditionally makes
    # sys.path grow by one duplicate entry on every call.
    if source_path not in sys.path:
        sys.path.append(source_path)
    from utils.constants import Constants
    from evaluation.context_top_n_runner import ContextTopNRunner

    print('\n\n************************\n************************\n')
    print('args', args)

    # Cast integer values
    for field in (Constants.FM_ITERATIONS_FIELD,
                  Constants.FM_NUM_FACTORS_FIELD):
        args[field] = int(args[field])
    if args[Constants.USE_CONTEXT_FIELD]:
        # Topic model parameters are only cast when context is used.
        for field in (Constants.TOPIC_MODEL_ITERATIONS_FIELD,
                      Constants.TOPIC_MODEL_PASSES_FIELD,
                      Constants.TOPIC_MODEL_NUM_TOPICS_FIELD):
            args[field] = int(args[field])

    Constants.update_properties(args)

    # Finish updating parameters

    my_context_top_n_runner = ContextTopNRunner()
    results = my_context_top_n_runner.run()
    # The metric is negated so that a higher metric means a lower loss.
    results['loss'] = -results[Constants.EVALUATION_METRIC]
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
示例#7
0
def create_topic_models():
    """Run the reviews preprocessor for each topic count in ``range(2, 61)``.

    Before every run the number-of-topics property is set to the current
    value; the preprocessor is created with its cache enabled.
    """
    for topic_count in range(2, 61):
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        Constants.update_properties(overrides)
        preprocessor = ReviewsPreprocessor(use_cache=True)
        preprocessor.full_cycle()
示例#8
0
文件: main.py 项目: swarnamd/yelp
def calculate_topic_stability(records):
    """Compute the stability of the topic model over resampled records.

    The random seeds are shifted by 10 and re-planted, a reference topic
    model is created from ``records`` (only the first half when
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set), and
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` further models are
    trained on samples drawn with a ratio of 0.8. The term rankings of all
    models are passed to ``calculate_stability``.

    :param records: the list of records used to build the topic models
    :return: whatever ``calculate_stability`` returns for the term rankings
    """
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division keeps the slice bound an integer; with true
        # division a float bound makes the slice raise TypeError.
        records = records[:len(records) // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    # Reference model built from the full record set.
    context_extractor =\
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = 0.8

    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    # One iteration is already covered by the reference model above.
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
示例#9
0
def run_recommender(args):
    """Execute ``ContextTopNRunner`` with the properties given in ``args``.

    Integer-valued parameters are converted with ``int`` (the topic model
    ones only when the use-context flag is truthy), the properties are
    pushed into ``Constants`` and the runner is executed.

    :param args: dictionary of property values; it is modified in place
    :return: the results dictionary from the runner plus a ``'loss'`` entry
        (negated evaluation metric) and ``'status'`` set to ``'ok'``
    """
    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    project_path = '/home/fpena/yelp/source/python'
    # Guard against duplicates: an unconditional append adds the same entry
    # to sys.path each time this function is called.
    if project_path not in sys.path:
        sys.path.append(project_path)
    from utils.constants import Constants
    from evaluation.context_top_n_runner import ContextTopNRunner

    print('\n\n************************\n************************\n')
    print('args', args)

    # Cast integer values
    args[Constants.FM_ITERATIONS_FIELD] = \
        int(args[Constants.FM_ITERATIONS_FIELD])
    args[Constants.FM_NUM_FACTORS_FIELD] = \
        int(args[Constants.FM_NUM_FACTORS_FIELD])
    if args[Constants.USE_CONTEXT_FIELD]:
        args[Constants.TOPIC_MODEL_ITERATIONS_FIELD] = \
            int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD])
        args[Constants.TOPIC_MODEL_PASSES_FIELD] = \
            int(args[Constants.TOPIC_MODEL_PASSES_FIELD])
        args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] = \
            int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD])

    Constants.update_properties(args)

    # Finish updating parameters

    my_context_top_n_runner = ContextTopNRunner()
    results = my_context_top_n_runner.run()
    # Negate the metric so that better results give a smaller loss.
    results['loss'] = -results[Constants.EVALUATION_METRIC]
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
示例#10
0
def cli_main():
    """Run the topic analysis (with stability) from the command line.

    ``-t/--numtopics`` is optional; when present its value replaces the
    number-of-topics property. The properties copy merged with the analysis
    output is written through ``write_results_to_csv`` and
    ``write_results_to_json``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--numtopics',
        metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')
    options = cli.parse_args()

    # With nargs=1 the parsed value is a single-item list (None if absent).
    topics_option = options.numtopics
    if topics_option is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topics_option[0]})

    report = Constants.get_properties_copy()
    report.update(analyze_topics(include_stability=True))

    base_name = 'topic_model_analysis'
    csv_target = Constants.generate_file_name(
        base_name, 'csv', Constants.RESULTS_FOLDER, None, None, False)
    json_target = Constants.generate_file_name(
        base_name, 'json', Constants.RESULTS_FOLDER, None, None, False)

    write_results_to_csv(csv_target, report)
    write_results_to_json(json_target, report)
示例#11
0
def manual_main():
    """Run the topic analysis without stability for the configured topics.

    Output file names are created and printed up front. Then, for each
    topic count in the list (only ``Constants.TOPIC_MODEL_NUM_TOPICS`` at
    present), the property is set, ``analyze_topics`` is run and the merged
    dictionary is handed to the CSV and JSON writers.
    """
    base_name = 'topic_model_analysis'
    csv_file_name = Constants.generate_file_name(
        base_name, 'csv', Constants.RESULTS_FOLDER, None, None, False)
    json_file_name = Constants.generate_file_name(
        base_name, 'json', Constants.RESULTS_FOLDER, None, None, False)
    print(json_file_name)
    print(csv_file_name)

    topic_counts = [Constants.TOPIC_MODEL_NUM_TOPICS]
    total = len(topic_counts)
    # The position is 1-based purely for the progress message.
    for position, topic_count in enumerate(topic_counts, 1):
        print('\ncycle_index: %d/%d' % (position, total))

        new_properties = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        print(new_properties)

        Constants.update_properties(new_properties)
        report = Constants.get_properties_copy()
        report.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, report)
        write_results_to_json(json_file_name, report)
def full_cycle():
    """Analyse topics for every topic-count / BoW-type / review-type mix.

    For each combination the three properties are set on ``Constants``,
    ``analyze_topics()`` is run, and the result (tagged with the BoW type
    and the review type) is collected; all results are printed at the end.

    NOTE(review): the output paths and headers computed last are not used
    by any statement visible in this function.
    """
    num_topics_list = [5, 10, 20, 40]
    # bow_type_list = [None, 'NN', 'JJ', 'VB']
    review_type_list = ['specific', 'generic']
    # num_topics_list = [10]
    bow_type_list = ['NN']
    results = []

    # Nested loops visit the combinations in the same order as a cartesian
    # product of (topics, bow types, review types).
    for num_topics in num_topics_list:
        for bow_type in bow_type_list:
            for review_type in review_type_list:
                Constants.update_properties({
                    Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
                    Constants.BOW_TYPE_FIELD: bow_type,
                    Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
                })

                analysis = analyze_topics()
                analysis.update({
                    Constants.BOW_TYPE_FIELD: bow_type,
                    Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
                })
                results.append(analysis)

    for analysis in results:
        print(analysis)

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_context_richness'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
示例#13
0
def cycle_eval_topic_model(metric, num_topics_list):
    """Evaluate the topic model with ``metric`` for several topic counts.

    :param metric: the evaluation metric; also used as the base name of the
        CSV results file
    :param num_topics_list: iterable with the numbers of topics to evaluate
    """
    output_csv = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None, False)

    for num_topics in num_topics_list:
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(overrides)
        evaluation = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(output_csv, evaluation)
示例#14
0
def cycle_eval_topic_model(metric, num_topics_list):
    """Run ``run_eval_topic_model`` once per requested number of topics.

    The CSV file name is generated once from ``metric``; each evaluation's
    results are passed to ``topic_model_analyzer.write_results_to_csv``
    with that same file name.

    :param metric: name of the metric to evaluate
    :param num_topics_list: the topic counts to iterate over
    """
    results_file = Constants.generate_file_name(
        metric, 'csv',
        Constants.RESULTS_FOLDER, None, None, False)

    for topic_count in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        metric_results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(
            results_file, metric_results)
示例#15
0
    def run_single_fold(self, parameters):
        """Run and evaluate one cross-validation fold.

        :param parameters: dictionary of properties applied to
            ``Constants``; it must contain the key ``'fold'`` (0-based fold
            index, printed as ``fold + 1``)
        :return: the value returned by ``self.evaluate()``
        """

        fold = parameters['fold']

        Constants.update_properties(parameters)

        Constants.print_properties()

        utilities.plant_seeds()
        self.load()

        records = self.original_records

        # self.plant_seeds()
        # NOTE: total_cycle_time is accumulated below but never returned or
        # printed by this method.
        total_cycle_time = 0.0
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        # Ratio passed as ``split`` to the train/test splitter: 1 - 1/k.
        split = 1 - (1 / float(num_folds))
        # Work on a deep copy so the original records are left untouched.
        self.records = copy.deepcopy(records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        fold_start = time.time()
        # Start offset of this fold, expressed as a fraction of the data.
        cv_start = float(fold) / num_folds
        print('\nFold: %d/%d' % ((fold + 1), num_folds))

        self.create_tmp_file_names(0, fold)
        self.train_records, self.test_records = \
            ETLUtils.split_train_test_copy(
                self.records, split=split, start=cv_start)
        # subsample_size = int(len(self.train_records)*0.5)
        # self.train_records = self.train_records[:subsample_size]
        self.get_records_to_predict(True)
        if Constants.USE_CONTEXT:
            if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                # Context topics come from the cache instead of training a
                # topic model for this fold.
                self.load_cache_context_topics(None, None)
            else:
                context_extractor = self.train_topic_model(0, fold)
                self.find_reviews_topics(context_extractor, 0, fold)
        else:
            # No context: run with an empty list of context-rich topics.
            self.context_rich_topics = []
        self.predict()
        metrics = self.evaluate()

        fold_end = time.time()
        fold_time = fold_end - fold_start
        total_cycle_time += fold_time
        self.clear()
        print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

        return metrics
def run_tests():
    """Cross-validate a ``WordContextTopNRunner`` for each parameter set.

    The parameter sets come from
    ``parameter_combinator.get_combined_parameters()``; each one is applied
    to ``Constants`` before a fresh runner is created.
    """
    parameter_sets = parameter_combinator.get_combined_parameters()
    total = len(parameter_sets)

    # 1-based counter, used only for the progress banner.
    for test_number, properties in enumerate(parameter_sets, 1):
        Constants.update_properties(properties)
        runner = WordContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_number, total))

        runner.perform_cross_validation()
示例#17
0
def run_tests():
    """Cross-validate a ``ContextTopNRunner`` for each hotel parameter set.

    Iterates over ``parameter_combinator.hotel_context_parameters()``,
    applying every set to ``Constants`` and running the cross validation on
    a newly created runner.
    """
    parameter_sets = parameter_combinator.hotel_context_parameters()
    total = len(parameter_sets)

    # The counter starts at 1 so the banner reads "Test 1/N".
    for test_number, properties in enumerate(parameter_sets, 1):
        Constants.update_properties(properties)
        runner = ContextTopNRunner()

        banner = '\n\n******************\nTest %d/%d\n******************\n'
        print(banner % (test_number, total))

        runner.perform_cross_validation()
示例#18
0
def main():
    """Run the recommender tests for one nested cross-validation fold.

    The mandatory ``-f/--fold`` integer is stored as the nested
    cross-validation cycle, the strategy is set to ``'nested_validate'``,
    and ``context_top_n_runner.run_tests()`` is invoked.
    """
    parser = argparse.ArgumentParser()
    # required=True: without the flag ``args.fold`` is None and indexing it
    # below would fail with an opaque TypeError; argparse now reports a
    # usage error instead.
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int, required=True,
        nargs=1, help='The index of the cross validation fold')

    args = parser.parse_args()
    # nargs=1 wraps the value in a one-element list.
    fold = args.fold[0]

    new_properties = {
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD: fold,
        Constants.CROSS_VALIDATION_STRATEGY_FIELD: 'nested_validate'
    }

    Constants.update_properties(new_properties)

    context_top_n_runner.run_tests()
示例#19
0
文件: main.py 项目: swarnamd/yelp
def topic_stability_main():
    """Print topic stability statistics for a fixed list of topic counts.

    The processed records are loaded once; for each number of topics the
    property is updated and ``calculate_topic_stability`` is run. Afterwards
    the NaN-ignoring mean, minimum and maximum of each score set are
    printed.
    """
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # num_topic_list = range(2, 101)
    num_topic_list = [2, 5]

    # All stability runs finish before any result line is printed.
    scores_by_topics = []
    for num_topics in num_topic_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        scores_by_topics.append(
            (num_topics, calculate_topic_stability(records)))

    print('Results:')
    for num_topics, scores in scores_by_topics:
        mean_score = numpy.nanmean(scores)
        lowest = numpy.nanmin(scores)
        highest = numpy.nanmax(scores)
        print('%d: %.4f [%.4f,%.4f]' %
              (num_topics, mean_score, lowest, highest))
示例#20
0
def create_all_term_rankings(records, metric):
    """Train several topic models and collect the term ranking of each one.

    The first model is trained on all of ``records``. The remaining
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` models are trained on
    a sample of the records, or on all of them when the metric is
    ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE`` (in which case the
    sample ratio property is also reset to ``None``).

    :param records: the records used to train the topic models
    :param metric: the evaluation metric that will consume the rankings
    :return: a list with one terms matrix per trained model
    """
    def rank_terms(training_records):
        # Train one context extractor and return its topic terms.
        extractor = topic_model_creator.train_context_extractor(
            training_records, False)
        return get_topic_model_terms(
            extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)

    print('%s: creating all term rankings' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Reference ranking, built from the complete record set.
    all_term_rankings = [rank_terms(records)]

    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO

    if metric in [TERM_STABILITY_PAIRWISE, TERM_DIFFERENCE]:
        sample_ratio = None
        Constants.update_properties(
            {Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO_FIELD: sample_ratio})
        msg = 'Warning: Since the metric is \'%s\' I have updated the ' \
              'topic_model_stability_sample_ratio value to None' % metric
        print(msg)

    num_iterations = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    for iteration in range(1, num_iterations):
        print('Iteration %d/%d' % (iteration, num_iterations))
        print('sample_ratio:', sample_ratio)

        if sample_ratio is None:
            training_records = records
        else:
            training_records = sample_list(records, sample_ratio)
        all_term_rankings.append(rank_terms(training_records))

    return all_term_rankings
示例#21
0
def create_all_term_rankings(records, metric):
    """Build the list of term rankings used by the stability metrics.

    One ranking comes from a model trained on the complete ``records``; the
    other ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` rankings come
    from models trained on sampled records. For the
    ``TERM_STABILITY_PAIRWISE`` and ``TERM_DIFFERENCE`` metrics no sampling
    takes place and the sample ratio property is set to ``None``.

    :param records: the records used to train the topic models
    :param metric: the metric the rankings are created for
    :return: list of terms matrices, one per trained model
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: creating all term rankings' % timestamp)

    num_terms_field = 'TOPIC_MODEL_STABILITY_NUM_TERMS'
    rankings = []

    # First ranking: model trained on every record.
    extractor = topic_model_creator.train_context_extractor(records, False)
    rankings.append(
        get_topic_model_terms(extractor, getattr(Constants, num_terms_field)))

    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO

    if metric in [TERM_STABILITY_PAIRWISE, TERM_DIFFERENCE]:
        sample_ratio = None
        Constants.update_properties(
            {Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO_FIELD: sample_ratio})
        msg = 'Warning: Since the metric is \'%s\' I have updated the ' \
              'topic_model_stability_sample_ratio value to None' % metric
        print(msg)

    num_iterations = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    for step in range(num_iterations - 1):
        print('Iteration %d/%d' % (step + 1, num_iterations))
        print('sample_ratio:', sample_ratio)

        # Without a ratio every model is trained on the full record set.
        training_records = (
            records if sample_ratio is None
            else sample_list(records, sample_ratio))
        extractor = topic_model_creator.train_context_extractor(
            training_records, False)
        rankings.append(get_topic_model_terms(
            extractor, getattr(Constants, num_terms_field)))

    return rankings
示例#22
0
def test():
    """Compute and save the NMF topic model divergence for 2 to 60 topics.

    For every topic count the trained ``NmfTopicExtractor`` data is loaded
    and ``calculate_divergence`` is evaluated against the document-term
    matrix. The results are printed and saved as CSV and JSON files under
    ``Constants.RESULTS_FOLDER``.
    """
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    # my_list = range(2, 31)
    for topic_count in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        divergence = calculate_divergence(
            document_term_matrix,
            topic_model.document_topic_matrix,
            topic_model.topic_term_matrix)

        # Read back from Constants after the property update above.
        current_topics = Constants.TOPIC_MODEL_NUM_TOPICS
        results.append({
            'num_topics': current_topics,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        })

        print('Num topics: %d, Divergence: %f' %
              (current_topics, divergence))

    for entry in results:
        print('%d %f' % (entry['num_topics'], entry['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +\
        '_topic_model_divergence'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(prefix + '.csv', results, headers)
    ETLUtils.save_json_file(prefix + '.json', results)
示例#23
0
def main():
    """Command-line entry point for a single nested cross-validation fold.

    Reads the mandatory ``-f/--fold`` integer, sets it as the nested
    cross-validation cycle together with the ``'nested_validate'``
    strategy, and calls ``context_top_n_runner.run_tests()``.
    """
    parser = argparse.ArgumentParser()
    # The flag is required: when it was omitted ``args.fold`` was None and
    # ``args.fold[0]`` raised TypeError; argparse now prints a usage error.
    parser.add_argument('-f',
                        '--fold',
                        metavar='int',
                        type=int,
                        nargs=1,
                        required=True,
                        help='The index of the cross validation fold')

    args = parser.parse_args()
    # nargs=1 stores the value inside a one-element list.
    fold = args.fold[0]

    new_properties = {
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD: fold,
        Constants.CROSS_VALIDATION_STRATEGY_FIELD: 'nested_validate'
    }

    Constants.update_properties(new_properties)

    context_top_n_runner.run_tests()
示例#24
0
def test():
    """Evaluate the divergence of the trained NMF models (2 to 60 topics).

    Each topic count is set on ``Constants``, the matching trained model is
    loaded and its divergence from the document-term matrix is computed.
    All results are printed, then written to
    ``<RESULTS_FOLDER><ITEM_TYPE>_topic_model_divergence`` with the
    ``.csv`` and ``.json`` extensions.
    """
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    # my_list = range(2, 31)
    for num_topics in range(2, 61):
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(overrides)

        extractor = NmfTopicExtractor()
        extractor.load_trained_data()
        doc_topics = extractor.document_topic_matrix
        topic_terms = extractor.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, doc_topics, topic_terms)

        row = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        }
        results.append(row)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for row in results:
        print('%d %f' % (row['num_topics'], row['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +\
        '_topic_model_divergence'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    # Column headers are the sorted keys of the first result row.
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
示例#25
0
def create_topic_model(num_topics):
    """Create the ensemble topic model for ``num_topics`` unless it exists.

    The random seeds are shifted by 10 and re-planted. If the
    ``factors_final_k<NN>.pkl`` file is already present in
    ``Constants.ENSEMBLED_RESULTS_FOLDER`` nothing is generated; otherwise
    the k-fold generation and the NMF combination steps are run.

    :param num_topics: the number of topics of the model
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: evaluating topic model' % timestamp)

    seed_shift = 10
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD:
            Constants.NUMPY_RANDOM_SEED + seed_shift,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + seed_shift,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    })
    utilities.plant_seeds()
    Constants.print_properties()

    model_file_name = \
        "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    file_path = Constants.ENSEMBLED_RESULTS_FOLDER + model_file_name

    if not os.path.exists(file_path):
        # topic_ensemble_caller.run_local_parse_directory()
        topic_ensemble_caller.run_generate_kfold()
        topic_ensemble_caller.run_combine_nmf()
    else:
        print('Ensemble topic model already exists')
def run_recommender(args):
    """Export the topics of a model configured from ``args``.

    A subset of ``args`` (business type, context extractor epsilon, topic
    model iterations/passes/number of topics, use-context flag) is applied
    to ``Constants`` and ``topic_model_analyzer.export_topics()`` is run.

    :param args: dictionary holding the property values listed above
    :return: the exported results dictionary extended with ``'loss'`` (the
        negated ``'combined_score'``) and ``'status'`` (``'ok'``)
    """
    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    source_path = '/home/fpena/yelp/source/python'
    # Avoid piling up duplicates: an unconditional append adds the same
    # entry to sys.path on every call of this function.
    if source_path not in sys.path:
        sys.path.append(source_path)
    from utils.constants import Constants
    from topicmodeling.context import topic_model_analyzer

    print('\n\n************************\n************************\n')
    print('args', args)

    parameters = {
        Constants.BUSINESS_TYPE_FIELD:
        args[Constants.BUSINESS_TYPE_FIELD],
        # 'lda_alpha': args['lda_alpha'],
        # 'lda_beta': args['lda_beta'],
        Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD:
        args[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD],
        # The next three values are cast to int before being applied.
        Constants.TOPIC_MODEL_ITERATIONS_FIELD:
        int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD]),
        Constants.TOPIC_MODEL_PASSES_FIELD:
        int(args[Constants.TOPIC_MODEL_PASSES_FIELD]),
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD:
        int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD]),
        # 'topic_weighting_method': args['topic_weighting_method'],
        Constants.USE_CONTEXT_FIELD:
        args[Constants.USE_CONTEXT_FIELD]
    }

    Constants.update_properties(parameters)
    # Finish updating parameters

    results = topic_model_analyzer.export_topics()
    # Negate the score so that a higher score gives a lower loss.
    results['loss'] = -results['combined_score']
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
示例#27
0
文件: main.py 项目: swarnamd/yelp
def create_topic_model_with_context_records():
    """Export one Excel topic sheet per topic count from classified reviews.

    Loads the classified processed reviews, prints the bag of words of the
    records that are both of context type ``'context'`` and predicted as
    ``'specific'``, and then, for every topic count from 1 to the number of
    context records, builds a topic model and writes its per-topic data
    with ``generate_excel_file``.

    NOTE(review): the models are trained on all the loaded ``records``, not
    on the filtered context records -- confirm this is intended.
    """
    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(
        records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' %
          len(context_specific_records))

    for position, record in enumerate(context_specific_records):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (position, record['bow']))

    max_topics = len(context_records)
    for num_topics in range(1, max_topics + 1):

        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            row = {'topic_id': topic}
            topic_terms = context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]
            row.update(split_topic(topic_terms))
            row['ratio'] = context_extractor.topic_ratio_map[topic]
            row['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(row)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER,
            None, None, True)
        generate_excel_file(topic_data, file_name)
示例#28
0
def run_tests():
    """Run a ``ContextTopNRunner`` per parameter set and report the best.

    Every set from ``parameter_combinator.get_combined_parameters()`` is
    applied to ``Constants`` and run. The set with the highest
    ``Constants.EVALUATION_METRIC`` value (starting from -1) is printed at
    the end along with that value.
    """
    parameter_sets = parameter_combinator.get_combined_parameters()
    total = len(parameter_sets)

    # Only scores strictly greater than the current best replace it.
    best_score = -1
    best_parameters = None
    for test_number, properties in enumerate(parameter_sets, 1):
        Constants.update_properties(properties)
        runner = ContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_number, total))

        outcome = runner.run()
        score = outcome[Constants.EVALUATION_METRIC]
        if score > best_score:
            best_score = score
            best_parameters = properties

    print('highest %s: %f' % (Constants.EVALUATION_METRIC, best_score))
    print(best_parameters)
示例#29
0
def evaluate_topic_model(metric):
    """Evaluate the configured topic model with the given metric.

    The random seeds are shifted by 10 and re-planted, the processed
    records are loaded (only the first half is kept when
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set) and the term
    rankings are created according to ``Constants.TOPIC_MODEL_TYPE``.

    :param metric: one of ``TERM_STABILITY_REFERENCE``,
        ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE``
    :return: the result of the evaluation function matching ``metric``
    :raises ValueError: if the topic model type or the metric is unknown
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD:
        Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD:
        Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division keeps the slice bound an integer; with true
        # division a float bound makes the slice raise TypeError.
        records = records[:len(records) // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
示例#30
0
def create_topic_model(num_topics):
    """Generate the ensemble topic model files for ``num_topics`` topics.

    After shifting the random seeds by 10 and planting them, the function
    checks ``Constants.ENSEMBLED_RESULTS_FOLDER`` for the
    ``factors_final_k<NN>.pkl`` file. When it is found a message is printed
    and nothing else is done; otherwise ``run_generate_kfold`` and
    ``run_combine_nmf`` are called on ``topic_ensemble_caller``.

    :param num_topics: the number of topics of the model
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    new_properties = {
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
    }
    Constants.update_properties(new_properties)
    utilities.plant_seeds()
    Constants.print_properties()

    # The file name is built from the value stored in Constants.
    factors_file = "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    already_created = os.path.exists(
        Constants.ENSEMBLED_RESULTS_FOLDER + factors_file)

    if already_created:
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
def run_recommender(args):
    """Export the topics for one parameter set and report the loss.

    The project source directory is added to ``sys.path`` (only once, so
    repeated calls do not keep growing the path) before the project modules
    are imported.

    :param args: mapping indexed by the ``Constants.*_FIELD`` names used
        below; iterations, passes and number of topics are cast to ``int``
    :return: the dict produced by ``topic_model_analyzer.export_topics()``
        extended with ``'loss'`` (the negated ``'combined_score'``) and
        ``'status'`` set to ``'ok'``
    """
    import sys
    # Alternative local checkout:
    # '/Users/fpena/UCC/Thesis/projects/yelp/source/python'
    source_root = '/home/fpena/yelp/source/python'
    # This function may be invoked many times in one process; appending
    # unconditionally would add a duplicate entry on every call.
    if source_root not in sys.path:
        sys.path.append(source_root)
    from utils.constants import Constants
    from topicmodeling.context import topic_model_analyzer

    print('\n\n************************\n************************\n')
    print('args', args)

    parameters = {
        Constants.BUSINESS_TYPE_FIELD: args[Constants.BUSINESS_TYPE_FIELD],
        # 'lda_alpha': args['lda_alpha'],
        # 'lda_beta': args['lda_beta'],
        Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD:
            args[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD],
        Constants.TOPIC_MODEL_ITERATIONS_FIELD:
            int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD]),
        Constants.TOPIC_MODEL_PASSES_FIELD:
            int(args[Constants.TOPIC_MODEL_PASSES_FIELD]),
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD:
            int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD]),
        # 'topic_weighting_method': args['topic_weighting_method'],
        Constants.USE_CONTEXT_FIELD: args[Constants.USE_CONTEXT_FIELD]
    }

    Constants.update_properties(parameters)
    # Finish updating parameters

    results = topic_model_analyzer.export_topics()
    # The score is negated so that a higher combined score yields a lower
    # loss.
    results['loss'] = -results['combined_score']
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
示例#32
0
def evaluate_topic_model(metric):
    """Evaluate the configured topic model with the requested metric.

    The random seeds stored in the properties are shifted by 10 and planted,
    the processed records are loaded (only the first half is kept when
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set), the term
    rankings are built according to ``Constants.TOPIC_MODEL_TYPE`` and the
    evaluation function matching ``metric`` is applied to them.

    :param metric: one of ``TERM_STABILITY_REFERENCE``,
        ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE``
    :return: the value returned by the selected ``eval_term_*`` function
    :raises ValueError: if the topic model type or the metric is unknown
    """
    print('%s: evaluating topic model' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division keeps the slice bound an int; true division yields
        # a float on Python 3 and slicing with it raises TypeError.
        records = records[:len(records) // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' % topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
示例#33
0
def full_cycle():
    """Analyze the topics for every (num topics, BOW type, review type) combo.

    For each combination the properties are updated, ``analyze_topics()`` is
    run and its result, tagged with the BOW type and the review type, is
    collected. All collected results are printed at the end.
    """
    topic_counts = [5, 10, 20, 40]  # alternative tried: [10]
    bow_types = ['NN']  # alternative tried: [None, 'NN', 'JJ', 'VB']
    review_types = ['specific', 'generic']

    results = []
    combinations = itertools.product(topic_counts, bow_types, review_types)
    for num_topics, bow_type, review_type in combinations:
        settings = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type,
        }
        Constants.update_properties(settings)

        outcome = analyze_topics()
        tags = {
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type,
        }
        outcome.update(tags)
        results.append(outcome)

    for outcome in results:
        print(outcome)

    # NOTE(review): the paths and headers below are computed but not used in
    # the visible part of this function -- confirm whether the code that
    # writes the results is missing.
    prefix = '%s%s%s' % (
        Constants.RESULTS_FOLDER, Constants.ITEM_TYPE,
        '_topic_model_context_richness')
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
示例#34
0
def export_topics(cycle_index, fold_index, epsilon=None, alpha=None):
    """Gather the ratio and top words of every topic and analyze them.

    :param cycle_index: forwarded to ``load_topic_model``
    :param fold_index: forwarded to ``load_topic_model``
    :param epsilon: when not None, stored as the ``'lda_epsilon'`` property
    :param alpha: when not None, stored as the ``'lda_alpha'`` property
    """
    topic_model_creator.plant_seeds()

    # Apply the optional overrides on a deep copy of the current properties
    # and push the whole copy back through update_properties.
    properties = copy.deepcopy(Constants._properties)
    if epsilon is not None:
        properties['lda_epsilon'] = epsilon
    if alpha is not None:
        properties['lda_alpha'] = alpha
    Constants.update_properties(properties)

    lda_based_context = load_topic_model(cycle_index, fold_index)

    name_parts = [
        Constants.ITEM_TYPE,
        str(Constants.LDA_NUM_TOPICS),
        str(Constants.LDA_MODEL_PASSES),
        str(Constants.LDA_MODEL_ITERATIONS),
        str(Constants.LDA_EPSILON),
    ]
    file_name = Constants.DATASET_FOLDER + 'all_reviews_topic_model_' + \
        '_'.join(name_parts) + '-nouns-complete.csv'
    print(file_name)

    num_words = 10
    # NOTE(review): ``headers`` is built but not used further down. Extra
    # statistic columns (words_ratio, past_verbs_ratio, the *_frq and *_log_*
    # families) were disabled in the original list.
    headers = ['topic_id', 'ratio', 'score']
    headers.extend('word' + str(word_index) for word_index in range(num_words))

    results = []

    topic_statistics_map = lda_based_context.topic_statistics_map
    topic_ratio_map = lda_based_context.topic_ratio_map

    num_reviews = len(lda_based_context.records)
    num_specific_reviews = len(lda_based_context.specific_reviews)
    num_generic_reviews = len(lda_based_context.generic_reviews)
    specific_percentage = float(num_specific_reviews) / num_reviews * 100
    generic_percentage = float(num_generic_reviews) / num_reviews * 100
    num_context_topics = len(lda_based_context.context_rich_topics)
    print('num reviews: %d' % num_reviews)
    print('num specific reviews: %d' % num_specific_reviews)
    print('num generic reviews: %d' % num_generic_reviews)
    print('specific reviews percentage : %f %%' % specific_percentage)
    print('generic reviews percentage : %f %%' % generic_percentage)
    print('number of contextual topics: %d' % num_context_topics)

    # One row per topic: its id, its ratio and the columns that split_topic
    # derives from the printed top-``num_words`` words of the topic.
    for topic in topic_ratio_map:
        row = {'topic_id': topic, 'ratio': topic_ratio_map[topic]}
        topic_words = lda_based_context.topic_model.print_topic(
            topic, topn=num_words)
        row.update(split_topic(topic_words))
        results.append(row)

    # A richer per-topic export driven by ``topic_statistics_map`` (frequency
    # ratios, weighted frequencies, log word / past-verb frequencies) was
    # commented out here in the original code.
    analyze_topics(results, lda_based_context)
示例#35
0
def main():
    """Analyze the topics for every combination of a hyper-parameter grid.

    For each combination the properties are updated, the analysis (stability
    included) is run, and the merged properties plus analysis results are
    written to the CSV and JSON result files.
    """
    file_name_args = (Constants.RESULTS_FOLDER, None, None, False)
    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', *file_name_args)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', *file_name_args)
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # Candidate values per hyper-parameter. Wider grids tried previously:
    #   epsilon: [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    #   num topics: [5, 10, 35, 50, 75, 100, 150, 200, 300, ..., 800]
    #   document level: ['review', 'sentence', 1]
    #   topic weighting: ['binary', 'probability']
    #   review type: ['specific', 'generic', 'all_reviews']
    #   passes: [1, 10, 20, 50, 75, 100, 200, 500]
    #   iterations: [50, 100, 200, 400, 800, 2000]
    #   topic model type: ['lda', 'nmf']
    epsilon_list = [0.05]
    alpha_list = [0.0]
    num_topics_list = range(1, 51)
    document_level_list = [1]
    topic_weighting_methods = ['probability']
    review_type_list = ['specific']
    lda_passes_list = [100]
    lda_iterations_list = [200]
    topic_model_type_list = ['nmf']
    bow_type_list = ['NN']

    # The order of this tuple fixes the order of the unpacking below.
    grid = (
        epsilon_list, alpha_list, num_topics_list, document_level_list,
        topic_weighting_methods, review_type_list, lda_passes_list,
        lda_iterations_list, topic_model_type_list, bow_type_list,
    )

    num_cycles = 1
    for candidates in grid:
        num_cycles *= len(candidates)

    for cycle_index, combination in enumerate(itertools.product(*grid), 1):
        (epsilon, alpha, num_topics, document_level, topic_weighting_method,
         review_type, lda_passes, lda_iterations, topic_model_type,
         bow_type) = combination

        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type,
        }
        print(new_dict)

        Constants.update_properties(new_dict)
        # Each output row holds the full property set plus the analysis
        # results.
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)
示例#36
0
def main():
    """Parse the command line and run the libFM or CARSKit evaluation cycle.

    Every option takes a single value. Options that are supplied override
    the matching property in ``Constants``; the algorithm defaults to
    ``'libfm'``.

    :raises ValueError: if the algorithm is neither ``'libfm'`` nor prefixed
        with ``'carskit_'``
    """
    # (short flag, long flag, metavar, type, help)
    option_specs = [
        ('-k', '--numtopics', 'int', int,
         'The number of topics of the topic model'),
        ('-i', '--itemtype', 'string', str, 'The type of items'),
        ('-s', '--strategy', 'string', str,
         'The evaluation strategy (user_test or rel_plus_n)'),
        ('-e', '--evaluationset', 'string', str, 'The evaluation set'),
        ('-cf', '--contextformat', 'string', str,
         'The strategy to extract the contextual information'),
        ('-a', '--algorithm', 'string', str,
         'The algorithm used to produce recommendations'),
        ('-cp', '--carskitparams', 'string', str,
         'The hyperparameters for the CARSKit model'),
    ]
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, metavar, value_type, help_text in option_specs:
        parser.add_argument(
            short_flag, long_flag, metavar=metavar, type=value_type,
            nargs=1, help=help_text)
    args = parser.parse_args()

    def first_or(values, fallback=None):
        # nargs=1 wraps each supplied value in a one-element list.
        if values is None:
            return fallback
        return values[0]

    num_topics = first_or(args.numtopics)
    item_type = first_or(args.itemtype)
    strategy = first_or(args.strategy)
    evaluation_set = first_or(args.evaluationset)
    context_format = first_or(args.contextformat)
    algorithm = first_or(args.algorithm, 'libfm')
    carskit_params = first_or(args.carskitparams)

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
    if item_type is not None:
        Constants.update_properties(
            {Constants.BUSINESS_TYPE_FIELD: item_type})
    if context_format is not None:
        Constants.update_properties(
            {Constants.CONTEXT_FORMAT_FIELD: context_format})
    if strategy is not None:
        Constants.update_properties(
            {Constants.RIVAL_EVALUATION_STRATEGY_FIELD: strategy})
    if carskit_params is not None:
        Constants.update_properties(
            {Constants.CARSKIT_PARAMETERS_FIELD: carskit_params})

    if algorithm.startswith('carskit_'):
        # The text after the 'carskit_' prefix names the CARSKit recommender.
        carskit_recommender = algorithm.split('carskit_')[1]
        Constants.update_properties(
            {'carskit_recommenders': carskit_recommender})
        full_cycle_carskit(evaluation_set)
        return
    if algorithm == 'libfm':
        full_cycle_libfm(evaluation_set)
        return
    raise ValueError('Unknown algorithm \'%s\'' % algorithm)
示例#37
0
def main():
    """Run the full cycle on every fold for each selected recommender.

    The recommender list depends on ``Constants.CARSKIT_ITEM_RANKING``. For
    each recommender the ``'carskit_recommenders'`` property is set, the
    properties file is modified and the cycle is run once per
    cross-validation fold, and the elapsed time is printed.
    """
    # modify_properties_file()
    # run_carskit()

    # Prefix shared by most of the lists below.
    averages = ['globalavg', 'useravg', 'itemavg', 'useritemavg']

    # Entries commented out in the original lists: 'rankals', 'ranksgd',
    # 'camf_c', 'cslim_cuci', 'camf_ics', 'camf_lcs', 'camf_mcs',
    # 'cslim_ics', 'cslim_lcs', 'cslim_mcs', 'gcslim_ics'.
    if Constants.CARSKIT_ITEM_RANKING:
        all_recommenders = averages + [
            'slopeone', 'pmf', 'bpmf', 'biasedmf', 'nmf',
            'slim', 'bpr', 'lrmf',
            'camf_ci', 'camf_cu', 'camf_cuci',
            'cslim_c', 'cslim_ci', 'cslim_cu',
        ]
    else:
        all_recommenders = averages + [
            'slopeone', 'pmf', 'biasedmf', 'nmf',
            'camf_ci', 'camf_cu', 'camf_cuci',
            'bpmf',
        ]

    # NOTE(review): the four lists below are not used by this function.
    # Every entry of slow_recommenders was commented out in the original
    # ('itemknn', 'userknn', 'cptf', 'gcslim_cc', 'gcslim_lcs',
    # 'gcslim_mcs', 'fm', and the context averages marked "broken without
    # context").
    slow_recommenders = []

    rating_recommenders = averages + [
        'itemknn', 'userknn', 'slopeone', 'pmf', 'biasedmf', 'nmf',
        'cptf',
        'camf_ci', 'camf_cu', 'camf_c', 'camf_cuci',
        'camf_ics', 'camf_lcs', 'camf_mcs',
        'fm',
    ]

    rank_recommenders = averages + [
        'itemknn', 'userknn', 'slopeone', 'pmf', 'bpmf', 'biasedmf', 'nmf',
        'slim', 'bpr', 'lrmf',
        'cptf',
        'camf_ci', 'camf_cu', 'camf_c', 'camf_cuci',
        'cslim_c', 'cslim_ci', 'cslim_cu',
        'gcslim_cc',
        'camf_lcs', 'camf_mcs',
        'cslim_ics', 'cslim_lcs', 'cslim_mcs',
        'gcslim_ics', 'gcslim_lcs', 'gcslim_mcs',
        'fm',
    ]

    rank_only_recommenders = [
        'slim', 'bpr', 'lrmf',
        'cslim_c', 'cslim_ci', 'cslim_cu',
        'gcslim_cc',
        'cslim_ics', 'cslim_lcs', 'cslim_mcs',
        'gcslim_ics', 'gcslim_lcs', 'gcslim_mcs',
    ]

    recommenders = all_recommenders
    total = len(recommenders)
    print('num recommenders: %d' % total)

    for position, recommender in enumerate(recommenders, 1):
        print('cycle %d/%d' % (position, total))
        print('Recommender: %s' % recommender)
        Constants.update_properties({'carskit_recommenders': recommender})

        started_at = time.time()
        for fold in range(Constants.CROSS_VALIDATION_NUM_FOLDS):
            modify_properties_file(fold)
            full_cycle(fold)
        elapsed = time.time() - started_at
        print("Cycle time = %f seconds" % elapsed)
示例#38
0
def old_main():
    """Run the full cycle on every fold for the hard-coded recommender.

    Only ``'CAMF_CU'`` is evaluated; the other lists are kept for reference.
    The properties file is not modified per fold here (that call is
    disabled).
    """
    # modify_properties_file()
    # run_carskit()

    # Recommender families, as listed in the original comments:
    # baseline-Avg recommender: GlobalAvg, UserAvg, ItemAvg, UserItemAvg
    # baseline-Context average recommender: ContextAvg, ItemContextAvg, UserContextAvg
    # baseline-CF recommender: ItemKNN, UserKNN, SlopeOne, PMF, BPMF, BiasedMF, NMF, SVD++
    # baseline-Top-N ranking recommender: SLIM, BPR, RankALS, RankSGD, LRMF
    # CARS - splitting approaches: UserSplitting, ItemSplitting, UISplitting; algorithm options: e.g., usersplitting -traditional biasedmf -minlenu 2 -minleni 2
    # CARS - filtering approaches: SPF, DCR, DCW
    # CARS - independent models: CPTF
    # CARS - dependent-dev models: CAMF_CI, CAMF_CU, CAMF_C, CAMF_CUCI, CSLIM_C, CSLIM_CI, CSLIM_CU, CSLIM_CUCI, GCSLIM_CC
    # CARS - dependent-sim models: CAMF_ICS, CAMF_LCS, CAMF_LCS, CSLIM_ICS, CSLIM_LCS, CSLIM_MCS, GCSLIM_ICS, GCSLIM_LCS, GCSLIM_MCS

    # NOTE(review): every list except ``recommenders`` is unused below.
    # Entries commented out in the original lists: 'rankals', 'ranksgd',
    # 'camf_c', 'cslim_cuci', 'camf_ics', 'camf_lcs', 'camf_mcs',
    # 'cslim_ics', 'cslim_lcs', 'cslim_mcs', 'gcslim_ics'.
    if Constants.CARSKIT_ITEM_RANKING:
        all_recommenders = [
            'globalavg', 'useravg', 'itemavg', 'useritemavg',
            'slopeone', 'pmf', 'bpmf', 'biasedmf', 'nmf',
            'slim', 'bpr', 'lrmf',
            'camf_ci', 'camf_cu', 'camf_cuci',
            'cslim_c', 'cslim_ci', 'cslim_cu',
        ]
    else:
        all_recommenders = [
            'globalavg', 'useravg', 'itemavg', 'useritemavg',
            'slopeone', 'pmf', 'biasedmf', 'nmf',
            'camf_ci', 'camf_cu', 'camf_cuci',
            'bpmf',
        ]

    # All entries were commented out in the original: 'itemknn', 'userknn',
    # 'cptf', 'gcslim_cc', 'gcslim_lcs', 'gcslim_mcs', 'fm', and the context
    # averages (marked "broken without context").
    slow_recommenders = []

    rating_recommenders = [
        'globalavg', 'useravg', 'itemavg', 'useritemavg',
        'itemknn', 'userknn', 'slopeone', 'pmf', 'biasedmf', 'nmf',
        'cptf',
        'camf_ci', 'camf_cu', 'camf_c', 'camf_cuci',
        'camf_ics', 'camf_lcs', 'camf_mcs',
        'fm',
    ]

    rank_recommenders = [
        'globalavg', 'useravg', 'itemavg', 'useritemavg',
        'itemknn', 'userknn', 'slopeone', 'pmf', 'bpmf', 'biasedmf', 'nmf',
        'slim', 'bpr', 'lrmf',
        'cptf',
        'camf_ci', 'camf_cu', 'camf_c', 'camf_cuci',
        'cslim_c', 'cslim_ci', 'cslim_cu',
        'gcslim_cc',
        'camf_lcs', 'camf_mcs',
        'cslim_ics', 'cslim_lcs', 'cslim_mcs',
        'gcslim_ics', 'gcslim_lcs', 'gcslim_mcs',
        'fm',
    ]

    rank_only_recommenders = [
        'slim', 'bpr', 'lrmf',
        'cslim_c', 'cslim_ci', 'cslim_cu',
        'gcslim_cc',
        'cslim_ics', 'cslim_lcs', 'cslim_mcs',
        'gcslim_ics', 'gcslim_lcs', 'gcslim_mcs',
    ]

    # recommenders = all_recommenders
    recommenders = ['CAMF_CU']

    num_recommenders = len(recommenders)
    print('num recommenders: %d' % num_recommenders)

    for cycle_number, recommender in enumerate(recommenders, start=1):
        print('cycle %d/%d' % (cycle_number, num_recommenders))
        print('Recommender: %s' % recommender)
        Constants.update_properties({'carskit_recommenders': recommender})

        cycle_start = time.time()
        fold = 0
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        while fold < num_folds:
            # modify_properties_file(fold)
            full_cycle(fold)
            fold += 1
        print("Cycle time = %f seconds" % (time.time() - cycle_start))
示例#39
0
def old_main():
    """Run the full cycle on every fold for the hard-coded recommender.

    Only ``'CAMF_CU'`` is evaluated; the remaining lists are reference data
    that this function does not read. The per-fold call that modifies the
    properties file is disabled.
    """
    # modify_properties_file()
    # run_carskit()

    # Recommender families, as listed in the original comments:
    # baseline-Avg recommender: GlobalAvg, UserAvg, ItemAvg, UserItemAvg
    # baseline-Context average recommender: ContextAvg, ItemContextAvg, UserContextAvg
    # baseline-CF recommender: ItemKNN, UserKNN, SlopeOne, PMF, BPMF, BiasedMF, NMF, SVD++
    # baseline-Top-N ranking recommender: SLIM, BPR, RankALS, RankSGD, LRMF
    # CARS - splitting approaches: UserSplitting, ItemSplitting, UISplitting; algorithm options: e.g., usersplitting -traditional biasedmf -minlenu 2 -minleni 2
    # CARS - filtering approaches: SPF, DCR, DCW
    # CARS - independent models: CPTF
    # CARS - dependent-dev models: CAMF_CI, CAMF_CU, CAMF_C, CAMF_CUCI, CSLIM_C, CSLIM_CI, CSLIM_CU, CSLIM_CUCI, GCSLIM_CC
    # CARS - dependent-sim models: CAMF_ICS, CAMF_LCS, CAMF_LCS, CSLIM_ICS, CSLIM_LCS, CSLIM_MCS, GCSLIM_ICS, GCSLIM_LCS, GCSLIM_MCS

    # The lists are written as whitespace-separated names; str.split() turns
    # each into the same list of strings as an explicit literal would.
    # Entries commented out in the original lists: 'rankals', 'ranksgd',
    # 'camf_c', 'cslim_cuci', 'camf_ics', 'camf_lcs', 'camf_mcs',
    # 'cslim_ics', 'cslim_lcs', 'cslim_mcs', 'gcslim_ics'.
    if Constants.CARSKIT_ITEM_RANKING:
        all_recommenders = (
            'globalavg useravg itemavg useritemavg '
            'slopeone pmf bpmf biasedmf nmf '
            'slim bpr lrmf '
            'camf_ci camf_cu camf_cuci '
            'cslim_c cslim_ci cslim_cu'
        ).split()
    else:
        all_recommenders = (
            'globalavg useravg itemavg useritemavg '
            'slopeone pmf biasedmf nmf '
            'camf_ci camf_cu camf_cuci '
            'bpmf'
        ).split()

    # All entries were commented out in the original: 'itemknn', 'userknn',
    # 'cptf', 'gcslim_cc', 'gcslim_lcs', 'gcslim_mcs', 'fm', and the context
    # averages (marked "broken without context").
    slow_recommenders = []

    rating_recommenders = (
        'globalavg useravg itemavg useritemavg '
        'itemknn userknn slopeone pmf biasedmf nmf '
        'cptf '
        'camf_ci camf_cu camf_c camf_cuci '
        'camf_ics camf_lcs camf_mcs '
        'fm'
    ).split()

    rank_recommenders = (
        'globalavg useravg itemavg useritemavg '
        'itemknn userknn slopeone pmf bpmf biasedmf nmf '
        'slim bpr lrmf '
        'cptf '
        'camf_ci camf_cu camf_c camf_cuci '
        'cslim_c cslim_ci cslim_cu '
        'gcslim_cc '
        'camf_lcs camf_mcs '
        'cslim_ics cslim_lcs cslim_mcs '
        'gcslim_ics gcslim_lcs gcslim_mcs '
        'fm'
    ).split()

    rank_only_recommenders = (
        'slim bpr lrmf '
        'cslim_c cslim_ci cslim_cu '
        'gcslim_cc '
        'cslim_ics cslim_lcs cslim_mcs '
        'gcslim_ics gcslim_lcs gcslim_mcs'
    ).split()

    # recommenders = all_recommenders
    recommenders = ['CAMF_CU']

    print('num recommenders: %d' % len(recommenders))

    cycle_number = 0
    for recommender in recommenders:
        cycle_number += 1
        print('cycle %d/%d' % (cycle_number, len(recommenders)))
        print('Recommender: %s' % recommender)
        Constants.update_properties({'carskit_recommenders': recommender})

        began = time.time()
        for fold in range(Constants.CROSS_VALIDATION_NUM_FOLDS):
            # modify_properties_file(fold)
            full_cycle(fold)
        finished = time.time()
        print("Cycle time = %f seconds" % (finished - began))
示例#40
0
def main():
    """Command-line entry point for the libFM / CARSKit evaluation cycle.

    Supplied options override the corresponding ``Constants`` properties.
    The algorithm (``'libfm'`` when omitted) decides which full cycle runs
    on the given evaluation set.

    :raises ValueError: if the algorithm is neither ``'libfm'`` nor prefixed
        with ``'carskit_'``
    """
    # Keyword sets shared by the option definitions; every option takes
    # exactly one value.
    one_int = dict(metavar='int', type=int, nargs=1)
    one_string = dict(metavar='string', type=str, nargs=1)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-k', '--numtopics',
        help='The number of topics of the topic model', **one_int)
    parser.add_argument(
        '-i', '--itemtype', help='The type of items', **one_string)
    parser.add_argument(
        '-s', '--strategy',
        help='The evaluation strategy (user_test or rel_plus_n)',
        **one_string)
    parser.add_argument(
        '-e', '--evaluationset', help='The evaluation set', **one_string)
    parser.add_argument(
        '-cf', '--contextformat',
        help='The strategy to extract the contextual information',
        **one_string)
    parser.add_argument(
        '-a', '--algorithm',
        help='The algorithm used to produce recommendations', **one_string)
    parser.add_argument(
        '-cp', '--carskitparams',
        help='The hyperparameters for the CARSKit model', **one_string)
    args = parser.parse_args()

    def option(name, default=None):
        # argparse stores each nargs=1 option as a one-element list, or None
        # when the option was not given.
        values = getattr(args, name)
        return values[0] if values is not None else default

    num_topics = option('numtopics')
    item_type = option('itemtype')
    strategy = option('strategy')
    evaluation_set = option('evaluationset')
    context_format = option('contextformat')
    algorithm = option('algorithm', 'libfm')
    carskit_params = option('carskitparams')

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
    if item_type is not None:
        Constants.update_properties(
            {Constants.BUSINESS_TYPE_FIELD: item_type})
    if context_format is not None:
        Constants.update_properties(
            {Constants.CONTEXT_FORMAT_FIELD: context_format})
    if strategy is not None:
        Constants.update_properties(
            {Constants.RIVAL_EVALUATION_STRATEGY_FIELD: strategy})
    if carskit_params is not None:
        Constants.update_properties(
            {Constants.CARSKIT_PARAMETERS_FIELD: carskit_params})

    # 'libfm' never carries the 'carskit_' prefix, so the two branches are
    # mutually exclusive and can be tested in either order.
    if algorithm == 'libfm':
        full_cycle_libfm(evaluation_set)
    elif algorithm.startswith('carskit_'):
        carskit_recommender = algorithm.split('carskit_')[1]
        Constants.update_properties(
            {'carskit_recommenders': carskit_recommender})
        full_cycle_carskit(evaluation_set)
    else:
        raise ValueError('Unknown algorithm \'%s\'' % algorithm)
示例#41
0
def main():
    """Sweep a hyper-parameter grid and record the topic analysis results.

    Every combination is pushed into ``Constants``; the current properties
    merged with the output of ``analyze_topics(include_stability=True)`` are
    then written to the CSV and JSON result files.
    """
    csv_file_name, json_file_name = [
        Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)
        for extension in ('csv', 'json')
    ]
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # Candidate values; the trailing comments list values used previously.
    epsilon_list = [0.05]  # [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    alpha_list = [0.0]
    num_topics_list = range(1, 51)  # [10, 20, 30, 50, 75, 100, 150, 300]
    bow_type_list = ['NN']
    document_level_list = [1]  # ['review', 'sentence', 1]
    topic_weighting_methods = ['probability']  # ['binary', 'probability']
    review_type_list = ['specific']  # ['specific', 'generic', 'all_reviews']
    lda_passes_list = [100]  # [1, 10, 20, 50, 75, 100, 200, 500]
    lda_iterations_list = [200]  # [50, 100, 200, 400, 800, 2000]
    topic_model_type_list = ['nmf']  # ['lda', 'nmf']

    # Materialise the grid up front so the number of cycles is simply its
    # length.
    combinations = list(itertools.product(
        epsilon_list, alpha_list, num_topics_list, document_level_list,
        topic_weighting_methods, review_type_list, lda_passes_list,
        lda_iterations_list, topic_model_type_list, bow_type_list))
    num_cycles = len(combinations)

    cycle_index = 0
    for (epsilon, alpha, num_topics, document_level, topic_weighting_method,
         review_type, lda_passes, lda_iterations, topic_model_type,
         bow_type) in combinations:
        cycle_index += 1
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))

        new_dict = {}
        new_dict[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] = num_topics
        new_dict[Constants.DOCUMENT_LEVEL_FIELD] = document_level
        new_dict[Constants.TOPIC_WEIGHTING_METHOD_FIELD] = \
            topic_weighting_method
        new_dict[Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD] = alpha
        new_dict[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD] = epsilon
        new_dict[Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD] = review_type
        new_dict[Constants.TOPIC_MODEL_PASSES_FIELD] = lda_passes
        new_dict[Constants.TOPIC_MODEL_ITERATIONS_FIELD] = lda_iterations
        new_dict[Constants.TOPIC_MODEL_TYPE_FIELD] = topic_model_type
        new_dict[Constants.BOW_TYPE_FIELD] = bow_type

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        analysis = analyze_topics(include_stability=True)
        results.update(analysis)

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)