Example #1
0
File: main.py  Project: swarnamd/yelp
def calculate_topic_stability(records):
    """Compute the stability of the topic model trained on ``records``.

    Trains a reference topic model on the full record list, then trains
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` further models on
    80% samples of the records, and compares the term rankings of all the
    models via ``calculate_stability``.

    :param records: list of review records used to train the topic models
    :return: the stability score produced by ``calculate_stability``
    """
    # Shift the seeds so this evaluation does not reuse the exact random
    # stream of a previous run, then re-seed the RNGs.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division: '/' on two ints yields a float in Python 3,
        # which is not a valid slice index.
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    # Reference ranking built from the full record set.
    context_extractor =\
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = 0.8

    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
示例#2
0
    def get_records_to_predict(self, use_random_seeds):
        """Select the records to predict for the configured metric.

        Dispatches to the top-N or the RMSE/MAE selection routine based on
        ``Constants.EVALUATION_METRIC``.

        :param use_random_seeds: when True, re-seed the RNGs first so the
            selection is reproducible
        :raises ValueError: if the configured metric is not recognized
        """
        if use_random_seeds:
            utilities.plant_seeds()

        metric = Constants.EVALUATION_METRIC
        if metric == 'topn_recall':
            self.get_records_to_predict_topn()
        elif metric in ('rmse', 'mae'):
            self.get_records_to_predict_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')
示例#3
0
    def full_cycle(self):
        """Run the complete ETL pipeline over the review records.

        If caching is enabled and a processed-records file exists, the
        records are loaded from it; otherwise the raw records go through
        the full cleaning/lemmatizing/topic-preparation pipeline and are
        exported.  Afterwards dataset statistics are printed and, when
        configured, the records are split between the topic-model and the
        recommender-system tasks.
        """
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.load_records()

            # Dataset-specific record normalization.
            if 'yelp' in Constants.ITEM_TYPE:
                self.transform_yelp_records()
            elif 'fourcity' in Constants.ITEM_TYPE:
                self.transform_fourcity_records()

            # The order of these steps matters: cleaning and filtering must
            # happen before the dictionary and corpus are built.
            self.add_integer_ids()
            self.clean_reviews()
            self.remove_duplicate_reviews()
            self.tag_reviews_language()
            self.remove_foreign_reviews()
            self.lemmatize_records()
            self.remove_users_with_low_reviews()
            self.remove_items_with_low_reviews()
            self.count_frequencies()
            self.shuffle_records()
            print('total_records: %d' % len(self.records))
            self.classify_reviews()
            self.build_bag_of_words()
            self.tag_contextual_reviews()
            # self.load_full_records()
            self.build_dictionary()
            self.build_corpus()
            self.label_review_targets()
            self.export_records()

        self.count_specific_generic_ratio()
        # self.export_to_triplet()

        # Report basic dataset statistics.
        rda = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % rda.calculate_density_approx())
        print('sparsity: %f' % rda.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        user_ids = \
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        item_ids = \
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users', len(user_ids))
        print('total items', len(item_ids))

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
示例#4
0
def main():
    """Run the full sentence-classification experiment grid.

    Trains every (document level, resampler, classifier) combination,
    prints the per-combination metrics and writes them all to a CSV file.
    """
    utilities.plant_seeds()

    resamplers = [
        None, 'random_over_sampler', 'smote_regular', 'smote_bl1', 'smote_bl2',
        'smote_tomek', 'smoteenn'
    ]

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    document_levels = ['review', 'sentence', 1]

    total_cycles = len(resamplers) * len(classifiers) * len(document_levels)
    cycle_index = 1
    all_results = []

    for document_level in document_levels:

        # The document level changes how records are preprocessed, so the
        # data set is rebuilt once per level.
        Constants.DOCUMENT_LEVEL = document_level
        records = load_records()
        preprocess_records(records)
        x_matrix, y_vector = transform(records)

        count_specific_generic(records)

        for resampler, classifier in itertools.product(
                resamplers, classifiers):

            print('Cycle %d/%d' % (cycle_index, total_cycles))

            all_results.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))
            cycle_index += 1

    for classification_results in all_results:
        print(classification_results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
        '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, all_results, all_results[0].keys())
def analyze_topics():
    """Score the trained topic model over the processed records.

    Loads the trained topic model ('ensemble' or 'lda'), extracts the
    context-rich topics from the records and returns the mean probability
    score of the topics plus the elapsed time.

    :return: dict with 'num_topics', 'probability_score' and 'cycle_time'
    """
    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    # NOTE(review): stays None for any other TOPIC_MODEL_TYPE, which makes
    # the indexing below fail — confirm only 'ensemble'/'lda' are used.
    topic_model_string = None
    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        # NOTE(review): other call sites in this file pass a term count
        # here instead of the string 'max' — verify which argument
        # print_topic_model expects.
        topic_model_string = topic_model.print_topic_model('max')
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    # One row per topic: its printed terms plus context ratio/frequency.
    topic_data = []

    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    data_frame = DataFrame.from_dict(topic_data)
    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    print('probability score: %f' % scores['probability_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores
示例#6
0
def analyze_topics():
    """Score the trained topic model over the processed records.

    Loads the trained topic model ('ensemble' or 'lda'), extracts the
    context-rich topics from the records and returns the mean probability
    score of the topics plus the elapsed time.

    :return: dict with 'num_topics', 'probability_score' and 'cycle_time'
    """
    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    model_type = Constants.TOPIC_MODEL_TYPE
    topic_model_string = None
    if model_type == 'ensemble':
        nmf_model = NmfTopicExtractor()
        nmf_model.load_trained_data()
        topic_model_string = nmf_model.print_topic_model('max')
    elif model_type == 'lda':
        lda_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            lda_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    # One row per topic: its printed terms plus context ratio/frequency.
    topic_data = []
    for topic_id in range(num_topics):
        entry = {'topic_id': topic_id}
        entry.update(split_topic(topic_model_string[topic_id]))
        entry['ratio'] = context_extractor.topic_ratio_map[topic_id]
        entry['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic_id]
        topic_data.append(entry)

    data_frame = DataFrame.from_dict(topic_data)
    probability_score = data_frame['probability_score'].mean()
    scores = {
        'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
        'probability_score': probability_score,
    }

    print('probability score: %f' % scores['probability_score'])

    cycle_time = time.time() - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores
示例#7
0
    def run_single_fold(self, parameters):
        """Run one cross-validation fold described by ``parameters``.

        Updates the global properties with the fold parameters, re-seeds
        the RNGs, splits the records into train/test for the requested
        fold, optionally trains or loads the context topic model, then
        predicts and evaluates.

        :param parameters: dict of property overrides; must contain 'fold'
        :return: the metrics dict produced by ``self.evaluate()``
        """
        fold = parameters['fold']

        Constants.update_properties(parameters)

        Constants.print_properties()

        utilities.plant_seeds()
        self.load()

        records = self.original_records

        # self.plant_seeds()
        total_cycle_time = 0.0
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        # Train on (num_folds - 1)/num_folds of the records.
        split = 1 - (1 / float(num_folds))
        self.records = copy.deepcopy(records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        fold_start = time.time()
        # The test window for this fold starts at fold/num_folds.
        cv_start = float(fold) / num_folds
        print('\nFold: %d/%d' % ((fold + 1), num_folds))

        self.create_tmp_file_names(0, fold)
        self.train_records, self.test_records = \
            ETLUtils.split_train_test_copy(
                self.records, split=split, start=cv_start)
        # subsample_size = int(len(self.train_records)*0.5)
        # self.train_records = self.train_records[:subsample_size]
        self.get_records_to_predict(True)
        if Constants.USE_CONTEXT:
            if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                self.load_cache_context_topics(None, None)
            else:
                context_extractor = self.train_topic_model(0, fold)
                self.find_reviews_topics(context_extractor, 0, fold)
        else:
            self.context_rich_topics = []
        self.predict()
        metrics = self.evaluate()

        fold_end = time.time()
        fold_time = fold_end - fold_start
        total_cycle_time += fold_time
        self.clear()
        print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

        return metrics
示例#8
0
    def full_cycle(self):
        """Run the preprocessing cycle, reusing cached records if allowed.

        When caching is enabled and the processed-records file exists, the
        records are loaded from it instead of being re-preprocessed.  When
        configured, the records are then split between the topic-model and
        the recommender-system tasks.
        """
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        cache_hit = self.use_cache and os.path.exists(
            Constants.PROCESSED_RECORDS_FILE)
        if cache_hit:
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.preprocess()

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
示例#9
0
def main():
    """Train a topic model on the processed records and render it as a PDF.

    Loads the processed records, optionally keeps only the topic-model
    half of them, trains the topic model and generates a LaTeX/PDF report.
    """
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division: '/' on two ints yields a float in Python 3,
        # which is not a valid slice index.
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
示例#10
0
def main():
    """Train a topic model on the processed records and render it as a PDF.

    Loads the processed records, optionally keeps only the topic-model
    half of them, trains the topic model and generates a LaTeX/PDF report.
    """
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division: '/' on two ints yields a float in Python 3,
        # which is not a valid slice index.
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
示例#11
0
def test_classifier(x_matrix, y_vector, sampler_type, my_classifier):
    """Cross-validate ``my_classifier`` on resampled data and report metrics.

    :param x_matrix: feature matrix
    :param y_vector: label vector
    :param sampler_type: name of the resampling strategy, or None
    :param my_classifier: scikit-learn style classifier instance
    :return: dict with the resampler and classifier names plus the
        confusion-matrix and ROC-curve metrics
    """
    utilities.plant_seeds()

    results = {
        'resampler': sampler_type,
        'classifier': type(my_classifier).__name__
    }
    resampled_x, resampled_y = resample(x_matrix, y_vector, sampler_type)
    print('num samples: %d' % len(resampled_y))

    # Hard predictions feed the confusion-matrix metrics.
    y_predictions, y_true_values = cross_validation_predict(
        my_classifier, resampled_x, resampled_y, 10, sampler_type, 'predict')
    results.update(print_confusion_matrix(y_true_values, y_predictions))

    # Probability of the positive class feeds the ROC metrics.
    y_probabilities, y_true_values = cross_validation_predict(
        my_classifier, resampled_x, resampled_y, 10, sampler_type,
        'predict_proba')
    positive_probabilities = y_probabilities[:, 1]

    results.update(plot_roc_curve(y_true_values, positive_probabilities))

    return results
示例#12
0
def full_cycle():
    """Preprocess the data set and train the regression models.

    Loads the train/test data, rescales and cleans it, builds the feature
    matrices and trains the model ensemble, logging timing at each stage.
    """
    utilities.plant_seeds()

    print("%s: Starting process" % (time.strftime("%Y/%m/%d-%H:%M:%S")))
    train, test = load_data()
    train = rescale_sale_price(train)
    train = remove_outliers(train)
    all_features, train_labels = split_features(train, test)
    all_features = normalize_numeric_features(all_features)
    X, X_test = create_x_sets(all_features, train_labels)
    print("%s: Finished data preprocessing" %
          (time.strftime("%Y/%m/%d-%H:%M:%S")))
    # NOTE(review): the trained models are neither used nor returned here —
    # presumably prediction/export happens elsewhere; confirm.
    ridge_model, svr_model, gbr_model, xgb_model, rf_model, stack_gen_model = train_models(
        X, train_labels)
    print("%s: Finished training models" %
          (time.strftime("%Y/%m/%d-%H:%M:%S")))
示例#13
0
    def run(self):
        """Execute the configured cross-validation strategy.

        For 'nested_validate' only the training share of the outer split
        is cross-validated; for 'nested_test' all original records are.

        :return: the result of ``perform_cross_validation``
        :raises ValueError: for an unknown cross-validation strategy
        """
        utilities.plant_seeds()
        self.load()

        records = self.original_records
        strategy = Constants.CROSS_VALIDATION_STRATEGY

        if strategy == 'nested_validate':
            # Keep only the outer training share; the held-out part is
            # reserved for the final nested test.
            num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
            cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
            split = 1 - (1 / float(num_folds))
            cv_start = float(cycle) / num_folds
            print('cv_start', cv_start)
            records, _ = ETLUtils.split_train_test(self.original_records,
                                                   split, cv_start)
            return self.perform_cross_validation(records)
        if strategy == 'nested_test':
            return self.perform_cross_validation(records)
        raise ValueError('Unknown cross-validation strategy')
示例#14
0
def create_topic_model(num_topics):
    """Train the ensemble topic model with ``num_topics`` topics.

    Re-seeds the RNGs with shifted seeds, then delegates to the topic
    ensemble caller unless the resulting factors file already exists.

    :param num_topics: number of topics the model should have
    """
    print('%s: evaluating topic model' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Shift the seeds so this run does not reuse the random stream of a
    # previous one, then re-seed the RNGs.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    })
    utilities.plant_seeds()
    Constants.print_properties()

    file_name = "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    file_path = Constants.ENSEMBLED_RESULTS_FOLDER + file_name

    if os.path.exists(file_path):
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
示例#15
0
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """Train the topic model for one (cycle, fold) of cross-validation.

    Loads the processed records, optionally reduces them to the outer
    training split for nested validation, re-seeds the RNGs, replays the
    shuffles of all cycles up to ``cycle_index`` and trains the model on
    the training part of the requested fold.

    :param cycle_index: zero-based cross-validation cycle
    :param fold_index: zero-based fold within the cycle
    :param check_exists: forwarded to ``create_topic_model`` so it can skip
        work when a cached model already exists
    :raises ValueError: if separate topic-model reviews are configured or
        the cross-validation strategy is unknown
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        # Keep only the outer training share of the records.
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    # Replay the shuffle performed at the start of every cycle up to and
    # including this one, so fold boundaries match the main experiment.
    for i in range(cycle_index+1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
示例#16
0
def evaluate_topic_model(metric):
    """Evaluate the configured topic-model algorithm under ``metric``.

    Loads the processed records, builds the term rankings for the
    configured algorithm ('lda'/'nmf' or 'ensemble') and dispatches to the
    evaluation routine matching ``metric``.

    :param metric: one of TERM_STABILITY_REFERENCE, TERM_STABILITY_PAIRWISE
        or TERM_DIFFERENCE
    :return: the score produced by the chosen evaluation routine
    :raises ValueError: for an unknown algorithm or metric
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Shift the seeds so this evaluation does not reuse the random stream
    # of a previous run, then re-seed the RNGs.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD:
        Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD:
        Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division: '/' on two ints yields a float in Python 3,
        # which is not a valid slice index.
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
示例#17
0
def create_topic_model(num_topics):
    """Train the ensemble topic model with ``num_topics`` topics.

    Re-seeds the RNGs with shifted seeds, then delegates to the topic
    ensemble caller unless the resulting factors file already exists.

    :param num_topics: number of topics the model should have
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Shift the seeds so this run does not reuse the random stream of a
    # previous one, then re-seed the RNGs.
    new_properties = {
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    }
    Constants.update_properties(new_properties)
    utilities.plant_seeds()
    Constants.print_properties()

    file_name = "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    file_path = Constants.ENSEMBLED_RESULTS_FOLDER + file_name

    if os.path.exists(file_path):
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
示例#18
0
def evaluate_topic_model(metric):
    """Evaluate the configured topic-model algorithm under ``metric``.

    Loads the processed records, builds the term rankings for the
    configured algorithm ('lda'/'nmf' or 'ensemble') and dispatches to the
    evaluation routine matching ``metric``.

    :param metric: one of TERM_STABILITY_REFERENCE, TERM_STABILITY_PAIRWISE
        or TERM_DIFFERENCE
    :return: the score produced by the chosen evaluation routine
    :raises ValueError: for an unknown algorithm or metric
    """
    print('%s: evaluating topic model' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Shift the seeds so this evaluation does not reuse the random stream
    # of a previous run, then re-seed the RNGs.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        # Floor division: '/' on two ints yields a float in Python 3,
        # which is not a valid slice index.
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' % topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
示例#19
0
    def perform_cross_validation(self):
        """Run the full num_cycles x num_folds cross-validation experiment.

        For every fold the records are split into train/test, the
        recommender is trained (optionally with a context word model),
        predictions are evaluated and the recall metrics are accumulated.
        The averaged results are written/appended to the CSV results file.
        """
        Constants.print_properties()

        utilities.plant_seeds()

        total_recall = 0.0
        total_specific_recall = 0.0
        total_generic_recall = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Train on (num_folds - 1)/num_folds of the data in each fold.
        split = 1 - (1/float(num_folds))

        self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            # Work on fresh copies so mutations made in this cycle do not
            # leak into later cycles.
            self.records = copy.deepcopy(self.original_records)
            self.reviews = copy.deepcopy(self.original_reviews)

            for j in range(num_folds):

                fold_start = time.time()
                # The test window for fold j starts at j/num_folds.
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j+1), num_folds))

                self.create_tmp_file_names()
                self.train_records, self.test_records = \
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.train_reviews, self.test_reviews = \
                    ETLUtils.split_train_test_copy(
                        self.reviews, split=split, start=cv_start)
                self.export()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_word_model()
                    self.find_reviews_topics(lda_based_context)
                self.prepare()
                self.predict()
                self.evaluate()
                recall = self.top_n_evaluator.recall
                specific_recall = self.top_n_evaluator.specific_recall
                generic_recall = self.top_n_evaluator.generic_recall
                total_recall += recall
                total_specific_recall += specific_recall
                total_generic_recall += generic_recall

                fold_end = time.time()
                fold_time = fold_end - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j+1), fold_time))

        average_recall = total_recall / total_iterations
        average_specific_recall = total_specific_recall / total_iterations
        average_generic_recall = total_generic_recall / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average recall: %f' % average_recall)
        print('average specific recall: %f' % average_specific_recall)
        print('average generic recall: %f' % average_generic_recall)
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        results = Constants.get_properties_copy()
        results['recall'] = average_recall
        results['specific_recall'] = average_specific_recall
        results['generic_recall'] = average_generic_recall
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        # NOTE(review): opening the CSV in 'wb' for csv.DictWriter is the
        # Python 2 idiom; on Python 3 this raises a TypeError (bytes vs
        # str) — confirm the target interpreter.
        if not os.path.exists(Constants.CSV_RESULTS_FILE):
            with open(Constants.CSV_RESULTS_FILE, 'wb') as f:
                w = csv.DictWriter(f, sorted(results.keys()))
                w.writeheader()
                w.writerow(results)
        else:
            with open(Constants.CSV_RESULTS_FILE, 'a') as f:
                w = csv.DictWriter(f, sorted(results.keys()))
                w.writerow(results)
示例#20
0
def analyze_topics(include_stability=True):
    """Score the trained topic model and its context separation.

    Loads the trained topic model ('ensemble' or 'lda'), joins each
    topic's terms with its context ratio/frequency and computes the mean
    probability score, the separation between context-rich and
    context-poor topics, and optionally the model stability.

    :param include_stability: when True also compute the (expensive)
        topic-stability score
    :return: dict of scores including 'probability_score',
        'separation_score', 'combined_score', 'stability' and 'cycle_time'
    """
    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    # NOTE(review): topic_model_string stays unbound for any other
    # TOPIC_MODEL_TYPE — confirm only 'ensemble'/'lda' are configured.
    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model(num_terms)
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    # One row per topic: its printed terms plus context ratio/frequency.
    topic_data = []

    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    # generate_excel_file(topic_data)
    data_frame = DataFrame.from_dict(topic_data)

    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score
    # Mean probability score of context-rich topics (ratio above beta)
    # versus context-poor topics (ratio below beta).
    high_ratio_mean_score = data_frame[(
        data_frame.ratio >
        Constants.CONTEXT_EXTRACTOR_BETA)]['probability_score'].mean()
    low_ratio_mean_score = data_frame[(
        data_frame.ratio <
        Constants.CONTEXT_EXTRACTOR_BETA)]['probability_score'].mean()

    stability = None
    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO
    if include_stability:
        stability = calculate_topic_stability(records, sample_ratio).mean()
    scores['stability'] = stability

    # separation_score =\
    #     (high_ratio_mean_score / low_ratio_mean_score)\
    #     if low_ratio_mean_score != 0\
    #     else 'N/A'
    # Weighted blend: high-ratio topics should score high, low-ratio ones
    # low; gamma balances the two contributions.
    gamma = 0.5
    separation_score = gamma * high_ratio_mean_score + (1 - gamma) * (
        1 - low_ratio_mean_score)
    joint_separation_score =\
        (high_ratio_mean_score + (1 - low_ratio_mean_score)) / 2
    scores['separation_score'] = separation_score
    scores['joint_separation_score'] = joint_separation_score
    scores['combined_score'] =\
        (probability_score * separation_score)\
        if probability_score != 'N/A' and separation_score != 'N/A'\
        else 'N/A'

    print('probability score: %f' % scores['probability_score'])
    print('separation score:', scores['separation_score'])
    print('combined score:', scores['combined_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores
示例#21
0
def analyze_topics(include_stability=True):
    """Score the trained topic model and its context separation.

    Loads the trained topic model ('ensemble' or 'lda'), joins each
    topic's terms with its context ratio/frequency and computes the mean
    probability score, the separation between context-rich and
    context-poor topics, and optionally the model stability.

    :param include_stability: when True also compute the (expensive)
        topic-stability score
    :return: dict of scores including 'probability_score',
        'separation_score', 'combined_score', 'stability' and 'cycle_time'
    """
    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    # NOTE(review): topic_model_string stays unbound for any other
    # TOPIC_MODEL_TYPE — confirm only 'ensemble'/'lda' are configured.
    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model(num_terms)
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    # One row per topic: its printed terms plus context ratio/frequency.
    topic_data = []

    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    # generate_excel_file(topic_data)
    data_frame = DataFrame.from_dict(topic_data)

    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score
    # Mean probability score of context-rich topics (ratio above beta)
    # versus context-poor topics (ratio below beta).
    high_ratio_mean_score = data_frame[
        (data_frame.ratio > Constants.CONTEXT_EXTRACTOR_BETA)]['probability_score'].mean()
    low_ratio_mean_score = data_frame[
        (data_frame.ratio < Constants.CONTEXT_EXTRACTOR_BETA)]['probability_score'].mean()

    stability = None
    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO
    if include_stability:
        stability = calculate_topic_stability(records, sample_ratio).mean()
    scores['stability'] = stability

    # separation_score =\
    #     (high_ratio_mean_score / low_ratio_mean_score)\
    #     if low_ratio_mean_score != 0\
    #     else 'N/A'
    # Weighted blend: high-ratio topics should score high, low-ratio ones
    # low; gamma balances the two contributions.
    gamma = 0.5
    separation_score = gamma*high_ratio_mean_score + (1 - gamma)*(1-low_ratio_mean_score)
    joint_separation_score =\
        (high_ratio_mean_score + (1 - low_ratio_mean_score)) / 2
    scores['separation_score'] = separation_score
    scores['joint_separation_score'] = joint_separation_score
    scores['combined_score'] =\
        (probability_score * separation_score)\
        if probability_score != 'N/A' and separation_score != 'N/A'\
        else 'N/A'

    print('probability score: %f' % scores['probability_score'])
    print('separation score:', scores['separation_score'])
    print('combined score:', scores['combined_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores