def PULearning_test(samples_positive, samples_unlabeled):
    """Run the positive-degree keyword selection and score the unlabeled set.

    Terms are selected from the positive and unlabeled term-sample matrices
    with fixed thresholds, written to ``./result/keywords.txt`` together with
    the positive set's vocabulary, and then used to compute and save a
    positive degree for every unlabeled sample.

    Only the unlabeled samples are scored here; the positive samples are
    used solely for term selection and for their vocabulary.

    Args:
        samples_positive: sample set exposing ``tsm`` and
            ``corpus.vocabulary``.
        samples_unlabeled: sample set exposing ``tsm``.
    """
    # Threshold order expected by the selector:
    # (pd_word, speciality, popularity).
    thresholds = (0.4, 0.8, 0.01)

    positive_matrix = samples_positive.tsm
    unlabeled_matrix = samples_unlabeled.tsm

    term_scores = select_features_by_positive_degree(
        positive_matrix, unlabeled_matrix, thresholds)
    pd.save_terms_positive_degree(
        term_scores,
        samples_positive.corpus.vocabulary,
        "./result/keywords.txt")

    # At most 20 terms are passed as the per-sample cap (max_terms).
    unlabeled_scores = pd.calculate_samples_positive_degree(
        unlabeled_matrix, term_scores, max_terms=20)
    pd.save_samples_positive_degree(samples_unlabeled, unlabeled_scores)
def show_category_keywords(self, result_dir):
    """Write one keyword file per second-level category into ``result_dir``.

    The directory is created when it does not exist yet. If creation fails,
    the error is logged and the method returns without producing any output.

    For every entry in ``self.get_categories().categories_2`` the method
    fetches the positive and unlabeled sample lists from ``self.tsm``,
    prints their sizes, computes the terms' positive degree for that
    category and saves it to
    ``<result_dir>/keywords_<category_id>_<category_name>.txt``.

    Args:
        result_dir: path of the output directory.

    Returns:
        None.
    """
    if not os.path.isdir(result_dir):
        try:
            os.mkdir(result_dir)
        except OSError:
            # Logger.error is a project helper; presumably it formats the
            # message before it is handed to the logging module.
            logging.error(Logger.error("mkdir %s failed." % (result_dir)))
            return

    tsm = self.tsm
    categories = self.get_categories()
    for category_name in categories.categories_2:
        category_id = categories.categories_2[category_name]
        positive_samples_list, unlabeled_samples_list = \
            tsm.get_samples_list_by_category_2(category_id)

        # Single-argument parenthesised form: prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print("%s(%d) Positive Samples: %d Unlabeled Samples: %d" % (category_name, category_id, len(positive_samples_list), len(unlabeled_samples_list)))

        terms_positive_degree = get_terms_positive_degree_by_category(
            tsm, positive_samples_list, unlabeled_samples_list)
        pd.save_terms_positive_degree(
            terms_positive_degree,
            self.corpus.vocabulary,
            "%s/keywords_%d_%s.txt" % (result_dir, category_id, category_name))