Exemplo n.º 1
0
    def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):
        """Prepare contextual train/test sets, export them to CSV and convert to LibFM format."""

        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews)

        print("Prepared data: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        base_name = output_folder + "yelp_" + dataset + "_context_shuffled_"
        csv_train_file = base_name + "train5.csv"
        csv_test_file = base_name + "test5.csv"

        ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

        print("Exported CSV and JSON files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        csv_files = [csv_train_file, csv_test_file]

        num_cols = len(self.headers)
        context_cols = num_cols
        print("num_cols", num_cols)

        # Dropping columns 3..context_cols strips the context features
        # (no-context variant); keeping every column yields the context-aware one.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, context_cols), ",",
            has_header=True, suffix=".no_context.libfm")
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ",", has_header=True,
            suffix=".context.libfm")

        print("Exported LibFM files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))
Exemplo n.º 2
0
    def export_without_context(self):
        """Write (user, item, rating) records with a single 'context:na' column to CSV."""
        print('%s: exporting to CARSKit binary ratings format without context' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Reuse a previously generated file when one exists.
        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        numpy.random.seed(0)

        # Every record gets context:na = 1 (no context information available).
        new_records = [
            {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
                'context:na': 1,
            }
            for record in self.records
        ]

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na',
        ]

        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Exemplo n.º 3
0
    def prepare(self):
        """Export train/test records to CSV and convert both files to LibFM input."""
        print('prepare: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))

        contextual_train_set = ETLUtils.select_fields(
            self.headers, self.train_records)
        contextual_test_set = ETLUtils.select_fields(
            self.headers, self.records_to_predict)

        ETLUtils.save_csv_file(
            self.csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(
            self.csv_test_file, contextual_test_set, self.headers)

        print('Exported CSV and JSON files: %s'
              % time.strftime("%Y/%d/%m-%H:%M:%S"))

        csv_files = [self.csv_train_file, self.csv_test_file]

        num_cols = len(self.headers)
        context_cols = num_cols
        print('num_cols', num_cols)

        # Removing columns 3..context_cols strips the context features;
        # keeping every column yields the context-aware version.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, context_cols), ',',
            has_header=True, suffix='.no_context.libfm')
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True,
            suffix='.context.libfm')

        print('Exported LibFM files: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))
Exemplo n.º 4
0
def run_top_n_test(records_file,
                   recommenders,
                   binary_reviews_file,
                   reviews_type=None):
    """Evaluate each recommender with recall@N cross-validation and log results to CSV."""

    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    total = len(recommenders)
    print('Total recommenders: %d' % total)

    results_list = []
    for count, recommender in enumerate(recommenders):
        print('\n**************\nProgress: %d/%d\n**************' %
              (count, total))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)

        # Rough ETA assuming each remaining run costs as much as this one did.
        remaining_hours = results['Execution time'] * (total - count) / 3600
        print('Estimated remaining time: %.2f hours' % remaining_hours)

    results_log_list = [
        process_topn_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Run the top-N recommender test across a process pool and log results to CSV."""

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # One argument tuple per recommender; every other parameter is constant.
    args = itertools.product(
        [records], recommenders, [top_n], [num_folds], [split],
        [min_like_score], [binary_reviews], [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()
    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # Aggregate the per-recommender results once every worker has finished.
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = [
        context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
Exemplo n.º 6
0
def main():
    """Grid-search resamplers x classifiers x document levels and save results to CSV."""
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    document_levels = ['review', 'sentence', 1]

    total_cycles = (len(my_resamplers) * len(my_classifiers)
                    * len(document_levels))
    cycle = 1
    results_list = []

    for document_level in document_levels:
        # The record-loading pipeline reads this global setting.
        Constants.DOCUMENT_LEVEL = document_level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(
                my_resamplers, my_classifiers):
            print('Cycle %d/%d' % (cycle, total_cycles))
            results_list.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))
            cycle += 1

    for results in results_list:
        print(results)

    csv_file = (Constants.DATASET_FOLDER + Constants.ITEM_TYPE
                + '_sentence_classifier_results.csv')
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def parallel_run_topn_test(records_file,
                           recommenders,
                           binary_reviews_file,
                           reviews_type=None):
    """Distribute the top-N recommender evaluation over a worker pool."""

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # Pair every recommender with the shared (constant) parameters.
    shared = ([records], recommenders, [top_n], [num_folds], [split],
              [min_like_score], [binary_reviews], [reviews_type])
    args = itertools.product(*shared)

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()
    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # Build the metadata that accompanies every result row.
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        row = context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map)
        results_log_list.append(row)

    file_name = ('recommender-topn-results-parallel'
                 + time.strftime("%Y%m%d-%H%M%S"))
    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')

    return results_list
Exemplo n.º 8
0
def main():
    """Evaluate every resampler/classifier pair at each max-sentences setting."""
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None, 'random_over_sampler', 'smote_regular', 'smote_bl1', 'smote_bl2',
        'smote_tomek', 'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    max_sentences_list = [None, 1]

    total_cycles = (len(my_resamplers) * len(my_classifiers)
                    * len(max_sentences_list))
    cycle = 1
    results_list = []

    for max_sentences in max_sentences_list:
        # The record-loading pipeline reads this global setting.
        Constants.MAX_SENTENCES = max_sentences
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(my_resamplers,
                                                       my_classifiers):
            print('Cycle %d/%d' % (cycle, total_cycles))
            classification_results = test_classifier(
                x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            cycle += 1

    for results in results_list:
        print(results)

    csv_file = (Constants.DATASET_FOLDER + Constants.ITEM_TYPE
                + '_sentence_classifier_results.csv')
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def run_top_n_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Measure recall@N for every recommender and dump a results log to CSV."""

    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    print('Total recommenders: %d' % (len(recommenders)))

    results_list = []
    for count, recommender in enumerate(recommenders):
        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))

        fold_results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(fold_results)

        # ETA assuming each remaining run costs as much as this one did.
        remaining_time = (fold_results['Execution time']
                          * (len(recommenders) - count)) / 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    results_log_list = [
        process_topn_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)]

    file_name = 'recommender-topn-results' + time.strftime("%Y%m%d-%H%M%S")
    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
Exemplo n.º 10
0
    def export_as_predefined_context(self):
        """Export records to CARSKit binary ratings CSV, encoding predefined
        context categories as one binary column per category.

        Each record gets a 'context:<category>' column set to 1 when the
        category is found in the review's bag of words, plus a 'context:na'
        column set to 1 only when no category matched.
        """
        print('%s: exporting to CARSKit ratings binary format with context as '
              'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Reuse a previously generated file when one exists.
        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []

        context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
        context_headers = [
            'context:%s' % category for category in context_categories]

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            review_categories = \
                find_predefined_context(record[Constants.BOW_FIELD])

            # Binary indicator per category; remember whether any matched.
            context_found = False
            for category in context_categories:
                category_key = 'context:' + category
                category_value = 0
                if category in review_categories:
                    category_value = 1
                    context_found = True
                new_record[category_key] = category_value

            # 'context:na' flags records with no matching context category.
            new_record['context:na'] = 0 if context_found else 1

            new_records.append(new_record)

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Exemplo n.º 11
0
def run_rmse_test(records_file,
                  recommenders,
                  binary_reviews_file,
                  reviews_type=None):
    """Cross-validate each recommender's RMSE and write the results to CSV."""

    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
    }

    total = len(recommenders)
    print('Total recommenders: %d' % total)

    results_list = []
    for count, recommender in enumerate(recommenders):
        print('\n**************\n%d/%d\n**************' % (count, total))
        results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)
        results_list.append(results)

        # Rough ETA from the last run's execution time.
        remaining_hours = results['Execution time'] * (total - count) / 3600
        print('Estimated remaining time: %.2f hours' % remaining_hours)

    results_log_list = [
        process_rmse_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)]

    file_name = 'recommender-rmse-results' + time.strftime("%Y%m%d-%H%M%S")
    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, RMSE_HEADERS,
                           '\t')
Exemplo n.º 12
0
    def export_as_top_word(self):
        """Export records to CARSKit binary ratings CSV, using the top word of
        each topic as a binary context column.

        A 'context:<top term>' column is set to 1 when the record has a
        positive weight for that topic; 'context:na' is 1 only when no topic
        had a positive weight.
        """
        print('%s: exporting to CARSKit ratings binary format with context as '
              'top words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Reuse a previously generated file when one exists.
        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []
        topic_model_string = self.topic_extractor.print_topic_model()
        top_terms = [get_topic_terms(topic) for topic in topic_model_string]
        context_headers = ['context:%s' % term[0] for term in top_terms]

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            topics = record[self.topics_field]
            context_found = False

            for topic in topics:
                topic_index = topic[0]
                topic_weight = topic[1]

                context_key = context_headers[topic_index]
                context_value = 1 if topic_weight > 0.0 else 0
                if context_value:
                    # BUG FIX: context_found was never updated, so
                    # 'context:na' was always 1 even when context existed
                    # (the sibling predefined-context exporter does set it).
                    context_found = True

                new_record[context_key] = context_value

            context_na_value = 0 if context_found else 1
            new_record['context:na'] = context_na_value

            new_records.append(new_record)

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Exemplo n.º 13
0
def run_rmse_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Run RMSE cross-validation for each recommender and log results to CSV."""

    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
    }

    print('Total recommenders: %d' % (len(recommenders)))

    results_list = []
    count = 0
    for recommender in recommenders:
        print('\n**************\n%d/%d\n**************' %
              (count, len(recommenders)))
        cv_results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)
        results_list.append(cv_results)

        # ETA assuming each remaining run costs as much as this one did.
        eta_hours = (cv_results['Execution time']
                     * (len(recommenders) - count)) / 3600
        print('Estimated remaining time: %.2f hours' % eta_hours)
        count += 1

    results_log_list = [
        process_rmse_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)]

    file_name = 'recommender-rmse-results' + time.strftime("%Y%m%d-%H%M%S")
    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, RMSE_HEADERS, '\t')
Exemplo n.º 14
0
    def export_as_all_words(self):
        """Export records to CARSKit binary ratings CSV, with one binary
        'context:<term>' column for every word seen across all records.
        """
        print('%s: exporting to CARSKit ratings binary format with context as '
              'all words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Reuse a previously generated file when one exists.
        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []
        all_terms = set()
        for record in self.records:
            all_terms |= set(record[Constants.BOW_FIELD])

        # NOTE(review): terms are de-accented here, but the membership test
        # below runs against the raw bag of words, so accented terms may never
        # match their column. Confirm whether BOW_FIELD is already normalized.
        all_terms = [remove_accents(term) for term in all_terms]

        context_headers = ['context:%s' % term for term in all_terms]

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            # Set lookup: O(1) membership instead of scanning the list per term.
            bag_of_words = set(record[Constants.BOW_FIELD])

            for term, context_header in zip(all_terms, context_headers):
                # BUG FIX: the original expression `term in bag_of_words > 0.0`
                # is a chained comparison (`term in bag and bag > 0.0`), which
                # raises TypeError on Python 3 whenever the term is present.
                new_record[context_header] = 1 if term in bag_of_words else 0

            new_records.append(new_record)

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
        ]
        headers.extend(context_headers)
        print(len(headers))
        print(headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Exemplo n.º 15
0
def add_extra_column_to_csv(
        csv_file_name='/tmp/results/rival_yelp_restaurant_results_folds_4.csv',
        output_file_name='/tmp/my_csv_file.csv',
        num_factors=10):
    """Insert a num-factors column into an existing results CSV.

    Reads *csv_file_name*, inserts ``Constants.FM_NUM_FACTORS_FIELD`` right
    after the 'Evaluation_Set' header, fills the new column with
    *num_factors* for every record, and writes the result to
    *output_file_name*. Defaults reproduce the original hard-coded behavior.
    """
    records = ETLUtils.load_csv_file(csv_file_name)

    # Read only the header row to find where the new column belongs.
    with open(csv_file_name, 'r') as csvinput:
        reader = csv.reader(csvinput)
        headers = next(reader)
        index = headers.index('Evaluation_Set') + 1
        headers.insert(index, Constants.FM_NUM_FACTORS_FIELD)

    print(headers)

    for record in records:
        record[Constants.FM_NUM_FACTORS_FIELD] = num_factors

    ETLUtils.save_csv_file(output_file_name, records, headers)
Exemplo n.º 16
0
def test():
    """Compute topic-model divergence over a range of topic counts and save it."""
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    for num_topics in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        divergence = calculate_divergence(
            document_term_matrix,
            topic_model.document_topic_matrix,
            topic_model.topic_term_matrix)

        results.append({
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE,
        })

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = (Constants.RESULTS_FOLDER + Constants.ITEM_TYPE
              + '_topic_model_divergence')
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(prefix + '.csv', results, headers)
    ETLUtils.save_json_file(prefix + '.json', results)
Exemplo n.º 17
0
    def prepare(self):
        """Export train/predict records to CSV and convert them to LibFM format."""
        print('prepare: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        self.headers = build_headers(len(self.sense_groups))

        if Constants.USE_CONTEXT is True:
            # Flatten the context-words map into top-level record fields.
            for record in self.train_records:
                record.update(record[Constants.CONTEXT_WORDS_FIELD])
            for record in self.records_to_predict:
                record.update(record[Constants.CONTEXT_WORDS_FIELD])

            if Constants.FM_REVIEW_TYPE:
                # Keep only training records of the configured review type.
                self.train_records = ETLUtils.filter_records(
                    self.train_records, Constants.PREDICTED_CLASS_FIELD,
                    [Constants.FM_REVIEW_TYPE])

        ETLUtils.keep_fields(self.headers, self.train_records)
        ETLUtils.keep_fields(self.headers, self.records_to_predict)

        ETLUtils.save_csv_file(
            self.csv_train_file, self.train_records, self.headers)
        ETLUtils.save_csv_file(
            self.csv_test_file, self.records_to_predict, self.headers)

        print('Exported CSV and JSON files: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_files = [self.csv_train_file, self.csv_test_file]

        print('num_cols', len(self.headers))

        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True,
            suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
Exemplo n.º 18
0
def test():
    """Sweep the number of topics, measuring model divergence at each value."""
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []
    topic_counts = range(2, 61)

    for topic_count in topic_counts:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        document_topic_matrix = topic_model.document_topic_matrix
        topic_term_matrix = topic_model.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, document_topic_matrix, topic_term_matrix)

        result = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        }
        results.append(result)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = (Constants.RESULTS_FOLDER + Constants.ITEM_TYPE
              + '_topic_model_divergence')
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(prefix + '.csv', results, headers)
    ETLUtils.save_json_file(prefix + '.json', results)
Exemplo n.º 19
0
def main_converter():
    """Export train/predict records to CSV and convert them to LibFM input."""

    csv_train_file = (GENERATED_FOLDER + 'yelp_training_set_review_'
                      + DATASET + 's_shuffled_train.csv')
    csv_test_file = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    headers = ['stars', 'user_id', 'business_id']
    train_records = ETLUtils.select_fields(
        headers, ETLUtils.load_json_file(TRAIN_RECORDS_FILE))
    records_to_predict = ETLUtils.select_fields(
        headers, ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE))

    ETLUtils.save_csv_file(csv_train_file, train_records, headers)
    ETLUtils.save_csv_file(csv_test_file, records_to_predict, headers)

    csv_to_libfm([csv_train_file, csv_test_file], 0, [1, 2], [], ',',
                 has_header=True)
Exemplo n.º 20
0
def evaluate_recommender_similarity_metrics(reviews, recommender):
    """Cross-validate ``recommender`` under several similarity configurations.

    For every combination of matrix builder, neighborhood size, similarity
    metric and significant-criteria range, the recommender is reconfigured in
    place (its private attributes are overwritten), evaluated with k-fold
    cross-validation on ``reviews``, and the resulting metrics are collected
    into one CSV row per configuration.
    """
    headers = [
        'Algorithm',
        'Multi-cluster',
        'Similarity algorithm',
        'Similarity metric',
        'Num neighbors',
        'Dataset',
        'MAE',
        'RMSE',
        'Top N',
        'Coverage',
        'Execution time',
        'Cross validation',
        'Machine'
    ]
    # Alternative metrics ('cosine', 'chebyshev', 'manhattan', 'pearson'),
    # builders, criteria ranges and neighborhood sizes were explored before;
    # only the single euclidean configuration is currently active.
    metric_names = ['euclidean']
    matrix_builders = [SingleSimilarityMatrixBuilder('euclidean')]
    criteria_ranges = [None]
    neighbor_counts = [None]  # e.g. [None, 1, 3, 5, 10, 20, 30, 40]
    num_folds = 5
    all_results = []

    for builder in matrix_builders:
        for neighbors in neighbor_counts:
            for metric in metric_names:
                for crit_range in criteria_ranges:
                    # Apply this configuration directly on the recommender.
                    recommender._similarity_matrix_builder = builder
                    builder._similarity_metric = metric
                    recommender._significant_criteria_ranges = crit_range
                    recommender._num_neighbors = neighbors

                    print(
                        recommender.name,
                        recommender._significant_criteria_ranges,
                        builder._name,
                        builder._similarity_metric,
                        recommender._num_neighbors
                    )

                    row = perform_cross_validation(
                        reviews, recommender, num_folds)
                    row['Algorithm'] = recommender.name
                    row['Multi-cluster'] = \
                        recommender._significant_criteria_ranges
                    row['Similarity algorithm'] = builder._name
                    row['Similarity metric'] = builder._similarity_metric
                    row['Cross validation'] = (
                        'Folds=' + str(num_folds) +
                        ', Iterations = ' + str(num_folds))
                    row['Num neighbors'] = recommender._num_neighbors
                    # NOTE(review): kept from the original — this stores the
                    # neighbor count under a key that is absent from
                    # ``headers``; it looks like a copy-paste slip, but it is
                    # preserved here to keep behavior identical. Confirm
                    # before removing.
                    row['Specific/Generic'] = recommender._num_neighbors
                    row['Dataset'] = 'Four City'
                    row['Machine'] = 'Mac'
                    all_results.append(row)

    out_path = (
        '/Users/fpena/tmp/rs-test/test-delete-' + recommender.name + '.csv')
    ETLUtils.save_csv_file(out_path, all_results, headers)
Exemplo n.º 21
0
    def export_as_topic_predefined_context(self):
        """Export ratings to CARSKit binary format with predefined context.

        Maps each record's context topics onto a fixed set of context
        categories (one binary ``context:<category>`` column per category,
        plus a ``context:na`` column set to 1 when no category matched) and
        writes the result to ``CSV_FILE``, copying it into the workspace.
        If ``CSV_FILE`` already exists, it is only copied and nothing is
        recomputed.
        """
        print('%s: exporting to CARSKit ratings binary format with context as '
              'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Skip the expensive export when a previous run already produced it.
        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []

        # Fixed category vocabulary for this item type; one output column each.
        context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
        context_headers = [
            'context:%s' % category for category in context_categories]
        # Topic ids are taken from the first record's context-topics keys;
        # assumes every record shares the same topic set — TODO confirm.
        context_topic_ids = [
            extract_topic_id(topic_name) for topic_name in
            self.records[0][Constants.CONTEXT_TOPICS_FIELD].keys()]
        # Drop names that are not real topics (extract_topic_id returned None).
        context_topic_ids = [
            topic for topic in context_topic_ids if topic is not None]
        topic_categories_map = \
            create_topic_categories_map(context_topic_ids, self.topic_extractor)

        print(topic_categories_map)

        index = 0

        # for record in self.records[3:4]:
        for record in self.records:

            # Base rating triple; ids use the integer-id fields.
            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            context_topics = record[Constants.CONTEXT_TOPICS_FIELD]

            # topic_categories = \
            #     find_predefined_context(record[Constants.BOW_FIELD])

            context_found = False
            for category in context_categories:
                category_key = 'context:' + category
                category_value = 0

                for topic_name in context_topics.keys():
                    topic_id = extract_topic_id(topic_name)
                    # NOTE(review): ``break`` abandons the remaining topics
                    # for this category as soon as one non-topic key is seen;
                    # ``continue`` may have been intended — confirm.
                    if topic_id is None:
                        break
                    topic_categories = topic_categories_map[topic_id]
                    # Mark the category active if any positive-weight topic
                    # belongs to it.
                    if context_topics[topic_name] > 0 and category in topic_categories:
                        category_value = 1
                        context_found = True
                new_record[category_key] = category_value

            # 'context:na' is the complement: 1 only when no category matched.
            context_na_value = 0 if context_found else 1
            new_record['context:na'] = context_na_value

            new_records.append(new_record)
            index += 1

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)

        print(new_records[0])
        # print(new_records[10])
        # print(new_records[100])

        # record_index = 0
        # all_context_headers = context_headers + ['context:na']
        # for record in new_records:
        #
        #     context_sum = 0
        #     for header in all_context_headers:
        #         context_sum += record[header]
        #     record_index += 1
        #     print(record_index, context_sum)

        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Exemplo n.º 22
0
def evaluate_recommender_similarity_metrics(reviews, recommender):
    """Sweep similarity configurations and cross-validate each one.

    Each configuration is written directly onto ``recommender`` (builder,
    metric, criteria ranges, neighbor count), evaluated with k-fold
    cross-validation on ``reviews``, and the per-configuration metrics are
    saved to a CSV report named after the recommender.
    """
    report_columns = [
        "Algorithm",
        "Multi-cluster",
        "Similarity algorithm",
        "Similarity metric",
        "Num neighbors",
        "Dataset",
        "MAE",
        "RMSE",
        "Top N",
        "Coverage",
        "Execution time",
        "Cross validation",
        "Machine",
    ]
    # Previously explored alternatives (other metrics, averaged/multi
    # builders, many criteria ranges, several neighborhood sizes) are
    # disabled; only the single euclidean configuration runs.
    metrics = ["euclidean"]
    builders = [SingleSimilarityMatrixBuilder("euclidean")]
    cluster_ranges = [None]
    neighbor_options = [None]  # e.g. [None, 1, 3, 5, 10, 20, 30, 40]
    folds = 5
    rows = []

    for builder in builders:
        for k_neighbors in neighbor_options:
            for metric in metrics:
                for significant_range in cluster_ranges:
                    # Push this configuration onto the recommender in place.
                    recommender._similarity_matrix_builder = builder
                    builder._similarity_metric = metric
                    recommender._significant_criteria_ranges = significant_range
                    recommender._num_neighbors = k_neighbors

                    print(
                        recommender.name,
                        recommender._significant_criteria_ranges,
                        builder._name,
                        builder._similarity_metric,
                        recommender._num_neighbors,
                    )

                    row = perform_cross_validation(reviews, recommender, folds)
                    row.update({
                        "Algorithm": recommender.name,
                        "Multi-cluster": recommender._significant_criteria_ranges,
                        "Similarity algorithm": builder._name,
                        "Similarity metric": builder._similarity_metric,
                        "Cross validation": "Folds=%s, Iterations = %s" % (folds, folds),
                        "Num neighbors": recommender._num_neighbors,
                        # NOTE(review): kept from the original — stores the
                        # neighbor count under a column that is not listed in
                        # ``report_columns``; likely a copy-paste slip, but
                        # preserved to keep behavior identical.
                        "Specific/Generic": recommender._num_neighbors,
                        "Dataset": "Four City",
                        "Machine": "Mac",
                    })
                    rows.append(row)

    output_file = (
        "/Users/fpena/tmp/rs-test/test-delete-%s.csv" % recommender.name)
    ETLUtils.save_csv_file(output_file, rows, report_columns)