Example #1
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
    # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # We remove from the classifier records the ones that don't have an
    # agreed label
    classifier_records = ETLUtils.filter_out_records(classifier_records,
                                                     Constants.REVIEW_ID_FIELD,
                                                     non_agreed_review_ids)

    # Finally, we update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[
            reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
Example #2
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
        # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # We remove from the classifier records the ones that don't have an
    # agreed label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally, we update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
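
Both examples above rely on ETLUtils helpers whose implementations are not part of this listing. A minimal sketch of how load_json_file and filter_out_records are assumed to behave (the real module may differ) is:

# Minimal sketch of the ETLUtils helpers assumed above; the actual module may differ.
import json


def load_json_file(file_path):
    # Assumes one JSON object per line (JSON Lines), as Yelp-style dumps usually are.
    with open(file_path) as json_file:
        return [json.loads(line) for line in json_file]


def filter_out_records(records, field, values):
    # Keep only the records whose value for `field` is NOT in `values`.
    return [record for record in records if record[field] not in values]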
Example #3
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
Example #4
    def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):

        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews
        )

        print("Prepared data: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        # json_train_file = output_folder + 'yelp_' + dataset + '_context_shuffled_train5.json'
        csv_train_file = output_folder + "yelp_" + dataset + "_context_shuffled_train5.csv"
        # json_test_file = output_folder + 'yelp_' + dataset + '_context_shuffled_test5.json'
        csv_test_file = output_folder + "yelp_" + dataset + "_context_shuffled_test5.csv"

        # ETLUtils.save_json_file(json_train_file, contextual_train_set)
        ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)

        # ETLUtils.save_json_file(json_test_file, contextual_test_set)
        ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

        print("Exported CSV and JSON files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        csv_files = [csv_train_file, csv_test_file]

        num_cols = len(self.headers)
        context_cols = num_cols
        print("num_cols", num_cols)
        # print('context_cols', context_cols)

        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, context_cols), ",", has_header=True, suffix=".no_context.libfm"
        )
        libfm_converter.csv_to_libfm(csv_files, 0, [1, 2], [], ",", has_header=True, suffix=".context.libfm")

        print("Exported LibFM files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))
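
The two csv_to_libfm calls convert the exported CSV files into libFM's sparse format, once keeping the context columns as features and once ignoring them. libfm_converter is not shown in this listing; the following is a deliberately simplified, hypothetical sketch of the idea, which only one-hot encodes the listed columns (the real converter also emits the numeric context columns):

# Hypothetical, simplified sketch of a CSV-to-libFM conversion; the real
# libfm_converter.csv_to_libfm may behave differently.
import csv


def csv_to_libfm_sketch(csv_file, target_column, one_hot_columns, delimiter=','):
    # Maps (column, value) pairs to libFM feature indices as they are first seen.
    feature_index = {}

    def get_index(column, value):
        key = (column, value)
        if key not in feature_index:
            feature_index[key] = len(feature_index)
        return feature_index[key]

    libfm_lines = []
    with open(csv_file) as input_file:
        reader = csv.reader(input_file, delimiter=delimiter)
        next(reader)  # skip the header row
        for row in reader:
            pairs = ['%d:1' % get_index(column, row[column])
                     for column in one_hot_columns]
            # libFM expects "target index:value index:value ..." on each line.
            libfm_lines.append(' '.join([row[target_column]] + pairs))
    return libfm_lines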
Example #5
 def export_records_to_predict(self, records_file):
     if self.records_to_predict is None:
         self.records_to_predict = self.get_records_to_predict()
     ETLUtils.save_json_file(records_file, self.records_to_predict)
     with open(records_file + '.pkl', 'wb') as write_file:
         pickle.dump(self.items_to_predict, write_file,
                     pickle.HIGHEST_PROTOCOL)
Example #6
    def multiple_lineal_regression(file_path):
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        ETLUtils.drop_fields(['stars'], records)
        data = np.array([record.values() for record in records])

        # Create linear regression object
        regr = linear_model.LinearRegression()

        # Train the model using the training sets
        regr.fit(data, ratings)

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)
        p = np.array([model.predict(xi) for xi in data])
        e = p - ratings

        total_error = np.dot(e, e)
        rmse_train = np.sqrt(total_error / len(p))

        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            p = np.array([model.predict(xi) for xi in data[test]])
            e = p - ratings[test]
            err += np.dot(e, e)


        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
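
The snippet above targets an old scikit-learn API (KFold(len(data), n_folds=10) and predicting one sample at a time). A rough equivalent against the current API, assuming data is a 2-D feature matrix and ratings a 1-D target vector, would be:

# Rough modern equivalent of the training / 10-fold CV RMSE computation above,
# written against the current scikit-learn API (an assumption, not the original code).
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold


def linear_regression_rmse(data, ratings):
    model = linear_model.LinearRegression(fit_intercept=True)
    model.fit(data, ratings)
    errors = model.predict(data) - ratings
    rmse_train = np.sqrt(np.dot(errors, errors) / len(data))

    squared_error = 0.0
    for train_index, test_index in KFold(n_splits=10).split(data):
        model.fit(data[train_index], ratings[train_index])
        fold_errors = model.predict(data[test_index]) - ratings[test_index]
        squared_error += np.dot(fold_errors, fold_errors)
    rmse_10cv = np.sqrt(squared_error / len(data))

    return rmse_train, rmse_10cv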
Example #7
def main_evaluate():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)

    # print('total predictions', len(predictions))
    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
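
TopNEvaluator itself is not part of this listing. As a rough idea of what a recall@N evaluation typically computes (one hidden relevant item per user, counted as a hit if it appears in the top-N ranking), a plausible sketch is:

# Illustrative recall@N (hit-rate) computation; TopNEvaluator's actual logic is not
# shown here, so this is only a plausible sketch.
def recall_at_n(hidden_items_by_user, rankings_by_user, n):
    if not hidden_items_by_user:
        return 0.0
    hits = 0
    for user_id, hidden_item in hidden_items_by_user.items():
        top_n_items = rankings_by_user.get(user_id, [])[:n]
        if hidden_item in top_n_items:
            hits += 1
    return float(hits) / len(hidden_items_by_user)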
Example #8
    def export_without_context(self):
        print('%s: exporting to CARSKit binary ratings format without context' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []
        numpy.random.seed(0)

        for record in self.records:

            context_na_value = 1

            new_records.append({
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
                'context:na': context_na_value,
            })

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]

        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
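
ETLUtils.save_csv_file is assumed to write the records as CSV rows under the given headers, so this context-free export produces one row per rating whose only context column is the constant context:na flag. A sketch of that assumed behaviour:

# Sketch of the assumed ETLUtils.save_csv_file behaviour; the real helper may differ.
import csv


def save_csv_file_sketch(file_path, records, headers):
    with open(file_path, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()
        # Each row then holds the user id, item id, rating and a context:na flag of 1.
        writer.writerows(records)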
Example #9
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Example #10
 def export_records(self):
     print('%s: get_records_to_predict_topn records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
     self.dictionary.save(Constants.DICTIONARY_FILE)
     ETLUtils.save_json_file(
         Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
     self.drop_unnecessary_fields()
     ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
Example #11
 def export_records_to_predict(self, records_file):
     if self.records_to_predict is None:
         self.records_to_predict = self.get_records_to_predict()
     ETLUtils.save_json_file(records_file, self.records_to_predict)
     with open(records_file + '.pkl', 'wb') as write_file:
         pickle.dump(
             self.items_to_predict, write_file, pickle.HIGHEST_PROTOCOL)
Example #12
def create_topic_models():

    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)

    plant_seeds()
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(num_cycles):

        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        train_records_list = []

        for j in range(num_folds):

            cv_start = float(j) / num_folds

            train_records, test_records =\
                ETLUtils.split_train_test(records, split=split, start=cv_start)
            train_records_list.append(train_records)

        args = zip(train_records_list,
                   [i] * Constants.CROSS_VALIDATION_NUM_FOLDS,
                   range(Constants.CROSS_VALIDATION_NUM_FOLDS))

        parallel_context_top_n(args)
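
ETLUtils.split_train_test is assumed to carve the test fold out of the shuffled records as a contiguous block whose size is (1 - split) of the data and whose position is given by start, which is what makes the loop above produce the usual cross-validation folds. A sketch under that assumption:

# Sketch of the assumed ETLUtils.split_train_test semantics: the test fold is the
# contiguous block of (1 - split) * len(records) items starting at offset `start`.
def split_train_test_sketch(records, split, start):
    num_records = len(records)
    test_size = int(round(num_records * (1 - split)))
    test_start = int(round(num_records * start))
    test_records = records[test_start:test_start + test_size]
    train_records = records[:test_start] + records[test_start + test_size:]
    return train_records, test_records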
Example #13
    def drop_unwanted_fields(dictionary_list):
        """
        Drops fields that are not useful for data analysis in the business
        data set

        :rtype: None
        :param dictionary_list: the list of dictionaries containing the data
        """
        unwanted_fields = [
            'attributes',
            'business_id',
            'categories',
            'city',
            'full_address',
            'latitude',
            'longitude',
            'hours',
            'name',
            'neighborhoods',
            'open',
            'review_count',
            'stars',
            'state',
            'type'
        ]

        ETLUtils.drop_fields(unwanted_fields, dictionary_list)
Example #14
    def multiple_lineal_regression(file_path):
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        ETLUtils.drop_fields(['stars'], records)
        data = np.array([record.values() for record in records])

        # Create linear regression object
        regr = linear_model.LinearRegression()

        # Train the model using the training sets
        regr.fit(data, ratings)

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)
        p = np.array([model.predict(xi) for xi in data])
        e = p - ratings

        total_error = np.dot(e, e)
        rmse_train = np.sqrt(total_error / len(p))

        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            p = np.array([model.predict(xi) for xi in data[test]])
            e = p - ratings[test]
            err += np.dot(e, e)

        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
Example #15
def run_top_n_test(records_file,
                   recommenders,
                   binary_reviews_file,
                   reviews_type=None):

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_list = []
    results_log_list = []
    count = 0
    print('Total recommenders: %d' % (len(recommenders)))

    for recommender in recommenders:

        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)

        results_list.append(results)

        remaining_time = results['Execution time'] * (len(recommenders) -
                                                      count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)
        count += 1

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')
Example #16
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])

    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
Example #17
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
Example #18
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
Example #19
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    args = itertools.product(
        [records],
        recommenders,
        [top_n],
        [num_folds],
        [split],
        [min_like_score],
        [binary_reviews],
        [reviews_type]
    )

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
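
Wrapping the constant arguments in single-element lists is what makes itertools.product yield exactly one argument tuple per recommender, which is the shape pool.map needs. A tiny illustration (names are hypothetical):

# product() over single-element lists yields one tuple per recommender, each carrying
# the same constant parameters.
import itertools

recommenders = ['knn_basic', 'knn_context']   # hypothetical recommender names
args = list(itertools.product([['r1', 'r2']], recommenders, [10], [5]))
# -> [(['r1', 'r2'], 'knn_basic', 10, 5), (['r1', 'r2'], 'knn_context', 10, 5)]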
Example #20
def main():
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    document_levels = ['review', 'sentence', 1]

    num_cycles = len(my_resamplers) * len(my_classifiers) * len(document_levels)
    index = 1

    results_list = []

    for document_level in document_levels:

        Constants.DOCUMENT_LEVEL = document_level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(my_resamplers, my_classifiers):

            print('Cycle %d/%d' % (index, num_cycles))

            classification_results =\
                test_classifier(x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            index += 1

    for results in results_list:
        print(results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
               '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
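
test_classifier is not included in this listing; it presumably resamples the training data with the chosen resampler and then scores the classifier. A much-simplified sketch that skips the resampling step and only reports cross-validated accuracy might look like:

# Simplified, hypothetical sketch of a test_classifier step; the real function in this
# code base is not shown and may resample the data and compute different metrics.
from sklearn.model_selection import cross_val_score


def test_classifier_sketch(x_matrix, y_vector, classifier, num_folds=5):
    scores = cross_val_score(classifier, x_matrix, y_vector, cv=num_folds)
    return {
        'classifier': type(classifier).__name__,
        'mean_accuracy': scores.mean(),
        'std_accuracy': scores.std(),
    }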
Example #21
    def test_select_fields(self):

        select_fields = ['user_id', 'offering_id', 'overall_rating']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5)
        self.assertEqual(result, reviews_matrix_5_short)

        select_fields = ['user_id']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5_short)
        self.assertEqual(result, reviews_matrix_5_users)
Example #22
    def test_select_fields(self):

        select_fields = ['user_id', 'offering_id', 'overall_rating']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5)
        self.assertEqual(result, reviews_matrix_5_short)

        select_fields = ['user_id']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5_short)
        self.assertEqual(result, reviews_matrix_5_users)
Example #23
def parallel_run_topn_test(records_file,
                           recommenders,
                           binary_reviews_file,
                           reviews_type=None):

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    args = itertools.product([records], recommenders, [top_n], [num_folds],
                             [split], [min_like_score], [binary_reviews],
                             [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            context_recommender_tests.process_topn_results(
                recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')

    return results_list
Example #24
def main():
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None, 'random_over_sampler', 'smote_regular', 'smote_bl1', 'smote_bl2',
        'smote_tomek', 'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    max_sentences_list = [None, 1]

    num_cycles = len(my_resamplers) * len(my_classifiers) * len(
        max_sentences_list)
    index = 1

    results_list = []

    for max_sentences in max_sentences_list:

        Constants.MAX_SENTENCES = max_sentences
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(my_resamplers,
                                                       my_classifiers):

            print('Cycle %d/%d' % (index, num_cycles))

            classification_results =\
                test_classifier(x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            index += 1

    for results in results_list:
        print(results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
               '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
Example #25
    def drop_unnecessary_fields(self):
        print(
            '%s: drop unnecessary fields' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        unnecessary_fields = [
            Constants.TEXT_FIELD,
            Constants.POS_TAGS_FIELD,
            # Constants.BOW_FIELD
        ]

        ETLUtils.drop_fields(unnecessary_fields, self.records)
Example #26
def run_top_n_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_list = []
    results_log_list = []
    count = 0
    print('Total recommenders: %d' % (len(recommenders)))

    for recommender in recommenders:

        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)

        results_list.append(results)

        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)
        count += 1

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
Example #27
    def drop_unnecessary_fields(self):
        print('%s: drop unnecessary fields' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        unnecessary_fields = [
            Constants.TEXT_FIELD,
            Constants.POS_TAGS_FIELD,
            Constants.VOTES_FIELD,
            Constants.BOW_FIELD
        ]

        ETLUtils.drop_fields(unnecessary_fields, self.records)
Example #28
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
Example #29
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Example #30
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars', 'text', 'review_id']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
Example #31
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Example #32
    def remove_reviews_from_classifier_training_set(self):
        """
        Removes the records that are part of the training set of the reviews
        classifier
        """
        classifier_records = \
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
        classifier_review_ids = \
            {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}

        self.records = ETLUtils.filter_out_records(
            self.records, Constants.REVIEW_ID_FIELD, classifier_review_ids)
Example #33
    def export_as_predefined_context(self):
        print('%s: exporting to CARSKit ratings binary format with context as '
              'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []

        context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
        context_headers = [
            'context:%s' % category for category in context_categories]

        index = 0

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            review_categories = \
                find_predefined_context(record[Constants.BOW_FIELD])

            context_found = False
            for category in context_categories:
                category_key = 'context:' + category
                category_value = 0
                if category in review_categories:
                    category_value = 1
                    context_found = True
                new_record[category_key] = category_value

            context_na_value = 0 if context_found else 1
            new_record['context:na'] = context_na_value

            new_records.append(new_record)
            index += 1

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Example #34
def run_rmse_test(records_file,
                  recommenders,
                  binary_reviews_file,
                  reviews_type=None):

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds

    results_list = []
    results_log_list = []
    count = 0
    print('Total recommenders: %d' % (len(recommenders)))

    for recommender in recommenders:

        print('\n**************\n%d/%d\n**************' %
              (count, len(recommenders)))
        results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)

        results_list.append(results)

        remaining_time = results['Execution time'] * (len(recommenders) -
                                                      count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)
        count += 1

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_rmse_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-rmse-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, RMSE_HEADERS,
                           '\t')
Example #35
    def test_drop_fields(self):

        drop_fields = [
            'cleanliness_rating', 'location_rating', 'rooms_rating',
            'service_rating', 'value_rating'
        ]

        test_list = list(reviews_matrix_5)

        ETLUtils.drop_fields(drop_fields, test_list)
        self.assertEqual(reviews_matrix_5_short, test_list)

        test_list = list(reviews_matrix_5_short)
        self.assertEqual(reviews_matrix_5_short, test_list)
Example #36
    def lemmatize_records(self):

        if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
            print('Records were already lemmatized')
            self.records = \
                ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
            return

        if Constants.DOCUMENT_LEVEL == 'review':
            self.records = self.lemmatize_reviews(self.records)
        elif Constants.DOCUMENT_LEVEL == 'sentence' or\
                isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
            self.records = self.lemmatize_sentences(self.records)

        ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
Example #37
    def train_topic_model(self, cycle_index, fold_index):

        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor
Example #38
    def drop_unwanted_fields(dictionary_list):
        """
        Drops fields that are not useful for data analysis in the business
        data set

        :rtype: None
        :param dictionary_list: the list of dictionaries containing the data
        """
        unwanted_fields = [
            'attributes', 'business_id', 'categories', 'city', 'full_address',
            'latitude', 'longitude', 'hours', 'name', 'neighborhoods', 'open',
            'review_count', 'stars', 'state', 'type'
        ]

        ETLUtils.drop_fields(unwanted_fields, dictionary_list)
Example #39
    def export_as_top_word(self):
        print('%s: exporting to CARSKit ratings binary format with context as '
              'top words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []
        topic_model_string = self.topic_extractor.print_topic_model()
        top_terms = [get_topic_terms(topic) for topic in topic_model_string]
        context_headers = ['context:%s' % term[0] for term in top_terms]

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            topics = record[self.topics_field]
            context_found = False

            for topic in topics:
                topic_index = topic[0]
                topic_weight = topic[1]

                context_key = context_headers[topic_index]
                context_value = 1 if topic_weight > 0.0 else 0
                if context_value:
                    # Mark that at least one context topic is active so that
                    # 'context:na' is only set when no context was found.
                    context_found = True

                new_record[context_key] = context_value
            # print(new_record)
            context_na_value = 0 if context_found else 1
            new_record['context:na'] = context_na_value

            new_records.append(new_record)

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Example #40
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        print(Constants.CLASSIFIED_RECORDS_FILE)
        training_records =\
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

        # If the document level is sentence-based (either 'sentence' or an int)
        document_level = Constants.DOCUMENT_LEVEL
        if document_level != 'review':

            if document_level == 'sentence':
                document_level = float("inf")

            training_records = [
                record for record in training_records
                if record['sentence_index'] < document_level
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        training_records = self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Example #41
def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):

    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_precision = 0.
    num_cycles = 0

    for i in xrange(0, num_folds):
        print('Fold', i)
        start = float(i) / num_folds
        train, test = ETLUtils.split_train_test(reviews,
                                                split=split,
                                                start=start)
        recommender.load(train)
        user_ids = recommender.user_ids

        for user_id in user_ids:
            precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)

            if precision is not None:
                total_precision += precision
                num_cycles += 1

    final_precision = total_precision / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % execution_time)

    result = {'Top N': final_precision, 'Execution time': execution_time}

    return result
Example #42
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
Example #43
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            records = records[:num_records / 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
Example #44
def initialize_cluster_users(reviews, significant_criteria_ranges=None):
    """
    Builds a dictionary containing all the users in the reviews. Each user
    contains information about its average overall rating, the list of reviews
    that user has made, and the cluster the user belongs to

    :param reviews: the list of reviews
    :return: a dictionary with the users initialized, the keys of the
    dictionaries are the users' ID
    """
    user_ids = get_groupby_list(reviews, 'user_id')
    user_dictionary = {}

    for user_id in user_ids:
        user = User(user_id)
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
        user.average_overall_rating = get_user_average_overall_rating(
            user_reviews, user_id, apply_filter=False)
        user.criteria_weights = get_criteria_weights(
            user_reviews, user_id, apply_filter=False)
        _, user.cluster = get_significant_criteria(
            user.criteria_weights, significant_criteria_ranges)
        user.item_ratings = get_user_item_ratings(user_reviews, user_id)
        user.item_multi_ratings = get_user_item_multi_ratings(user_reviews, user_id)
        user_dictionary[user_id] = user

    # print('Total users: %i' % len(user_ids))

    return user_dictionary
Example #45
    def calculate_sparsity(self):
        """
        Returns the percentage of missing ratings in the list of reviews of this
        ReviewsDatasetAnalyzer

        :return: the rate of missing ratings
        (i.e. number of missing ratings / (number of items * number of users))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError("Can not determine the sparsity for an empty list")

        user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        item_ids = extractor.get_groupby_list(self.reviews, "offering_id")

        non_missing_reviews = 0.0
        total_expected_reviews = len(user_ids) * len(item_ids)

        for user in user_ids:
            user_reviews = ETLUtils.filter_records(self.reviews, "user_id", [user])
            user_items = extractor.get_groupby_list(user_reviews, "offering_id")

            non_missing_reviews += len(set(item_ids).intersection(set(user_items)))

        return 1 - non_missing_reviews / total_expected_reviews
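
As a quick sanity check of the sparsity formula, two users and three items with three observed ratings should give 1 - 3/6 = 0.5 (assuming each (user, item) pair appears at most once):

# Toy check of the sparsity formula above.
toy_reviews = [
    {'user_id': 'u1', 'offering_id': 'i1'},
    {'user_id': 'u1', 'offering_id': 'i2'},
    {'user_id': 'u2', 'offering_id': 'i3'},
]
users = {review['user_id'] for review in toy_reviews}
items = {review['offering_id'] for review in toy_reviews}
sparsity = 1 - float(len(toy_reviews)) / (len(users) * len(items))
print(sparsity)  # 0.5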
Example #46
def get_user_item_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    rating that user_id has given to that item

    :param reviews: a list of reviews
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates whether the reviews have to
    be filtered by user_id. In other words, it indicates whether the list
    contains reviews from several users; if it does, the reviews from other
    users are removed
    :return: a dictionary with the items that the given user has rated
    """

    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    if not user_reviews:
        return {}

    data_frame = DataFrame(user_reviews)
    column = 'offering_id'
    counts = data_frame.groupby(column).mean()

    items = counts.index.get_level_values(0).tolist()
    items_ratings = {}

    for item, mean in zip(items, counts['overall_rating']):
        items_ratings[item] = mean

    return items_ratings
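
The groupby/mean step averages the overall_rating per item, which matters when the same user rated an item more than once. A small illustration with toy data:

# Small illustration of the groupby/mean step above, using toy reviews.
from pandas import DataFrame

toy_reviews = [
    {'offering_id': 'i1', 'overall_rating': 4.0},
    {'offering_id': 'i1', 'overall_rating': 2.0},
    {'offering_id': 'i2', 'overall_rating': 5.0},
]
counts = DataFrame(toy_reviews).groupby('offering_id').mean()
print(dict(zip(counts.index.tolist(), counts['overall_rating'])))
# {'i1': 3.0, 'i2': 5.0}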
Example #47
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
Example #48
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
Example #49
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        dataset = Constants.ITEM_TYPE
        folder = Constants.DATASET_FOLDER
        file_name_suffix =\
            '' if Constants.MAX_SENTENCES is None else '_sentences'
        training_records_file = folder +\
            'classified_' + dataset + '_reviews' + file_name_suffix + '.json'
        training_records = ETLUtils.load_json_file(training_records_file)

        if Constants.MAX_SENTENCES is not None:
            training_records = [
                record for record in training_records
                if record['sentence_index'] < Constants.MAX_SENTENCES
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Example #50
    def calculate_sparsity(self):
        """
        Returns the percentage of missing ratings in the list of reviews of this
        ReviewsDatasetAnalyzer

        :return: the rate of missing ratings
        (i.e. number of missing ratings / (number of items * number of users))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError(
                'Can not determine the sparsity for an empty list')

        user_ids = extractor.get_groupby_list(self.reviews,
                                              Constants.USER_ID_FIELD)
        item_ids = extractor.get_groupby_list(self.reviews,
                                              Constants.ITEM_ID_FIELD)

        non_missing_reviews = 0.
        total_expected_reviews = len(user_ids) * len(item_ids)

        for user in user_ids:
            user_reviews = ETLUtils.filter_records(self.reviews,
                                                   Constants.USER_ID_FIELD,
                                                   [user])
            user_items = extractor.get_groupby_list(user_reviews,
                                                    Constants.ITEM_ID_FIELD)

            non_missing_reviews += len(
                set(item_ids).intersection(set(user_items)))

        return 1 - non_missing_reviews / total_expected_reviews
Example #51
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records, self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Example #52
def get_ml_100K_dataset():
    # records = ETLUtils.load_csv_file('/Users/fpena/tmp/bpmf/ml-1k.csv', '\t')
    records = ETLUtils.load_csv_file('/Users/fpena/tmp/bpmf/ml-100k.csv', '\t')
    # records = ETLUtils.load_csv_file('/Users/fpena/UCC/Thesis/datasets/uncompressed/ml-100k.csv', '\t')
    for record in records:
        record['overall_rating'] = float(record['overall_rating'])
    return records
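
ETLUtils.load_csv_file is assumed to read a delimited file into a list of dictionaries keyed by the header row, which is why the snippet can then index each record by 'overall_rating'. A sketch of that assumption:

# Sketch of the assumed ETLUtils.load_csv_file behaviour; the real helper may differ.
import csv


def load_csv_file_sketch(file_path, delimiter=','):
    with open(file_path) as csv_file:
        reader = csv.DictReader(csv_file, delimiter=delimiter)
        return [dict(row) for row in reader]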