def do_query_sample(corpus_dir, samples_name, sample_id):
    """Query one sample, by id, from a named sample set of a corpus.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.
    sample_id    -- id forwarded to ``Samples.query_by_id``.

    Returns None; whatever ``query_by_id`` produces is not returned.
    """
    source_corpus = Corpus(corpus_dir)
    # The vocabulary is loaded before the sample set is constructed.
    source_corpus.vocabulary.load()

    sample_set = Samples(source_corpus, samples_name)
    sample_set.load()
    sample_set.query_by_id(sample_id)
def do_purge(corpus_dir, samples_name):
    """Load a named sample set and run ``Fix.purge`` on it.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.

    Returns None.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    fixer = Fix(sample_set)
    fixer.purge()
def do_rebuild_categories(corpus_dir, samples_name):
    """Fix and then rebuild the categories of a named sample set.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.

    Runs ``Fix.fix_categories`` first, then ``Samples.rebuild_categories``.
    Returns None.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    message = Logger.debug("Rebuild base data...")
    logging.debug(message)

    fixer = Fix(sample_set)
    fixer.fix_categories()
    sample_set.rebuild_categories()
def do_predict(corpus_dir, samples_name, model_name, result_dir):
    """Run ``multicategories_predict`` over a named sample set.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.
    model_name   -- forwarded unchanged to ``multicategories_predict``.
    result_dir   -- forwarded unchanged to ``multicategories_predict``.

    Returns None.
    """
    source_corpus = Corpus(corpus_dir)
    # The vocabulary is loaded before the sample set is constructed.
    source_corpus.vocabulary.load()

    sample_set = Samples(source_corpus, samples_name)
    sample_set.load()

    message = Logger.debug("Predicting ...")
    logging.debug(message)
    multicategories_predict(sample_set, model_name, result_dir)
def do_query_categories(corpus_dir, samples_name, xls_file):
    """Run ``Samples.query_categories`` for a named sample set and log completion.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.
    xls_file     -- forwarded to ``Samples.query_categories``.

    Returns None.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()
    sample_set.query_categories(xls_file)

    summary = "Query categories %s/<%s> Done. %s" % (corpus_dir, samples_name, xls_file)
    logging.info(Logger.info(summary))
def do_query_keywords(corpus_dir, samples_name, result_dir):
    """Run ``Samples.show_category_keywords`` for a named sample set and log completion.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.
    result_dir   -- forwarded to ``Samples.show_category_keywords``.

    Returns None.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    # Only the per-category keyword view is produced; the alternative
    # ``show_keywords_matrix()`` call stays disabled.
    sample_set.show_category_keywords(result_dir)

    summary = "Query keywords %s/<%s> Done. %s" % (corpus_dir, samples_name, result_dir)
    logging.info(Logger.info(summary))
def do_show(corpus_dir, samples_name):
    """Load a named sample set and call ``Samples.show`` on it.

    corpus_dir   -- location handed to ``Corpus``; None makes this a no-op.
    samples_name -- name of the sample set; None makes this a no-op.

    Returns None.

    NOTE(review): a later ``def do_show`` in this module rebinds the name,
    so this definition is not the one callers get. Consider removing one
    of the two.
    """
    # The previous guard was ``(not corpus_dir) is None and ...``. ``not``
    # always yields a bool, which is never None, so the body could never
    # run. Test the arguments themselves against None instead.
    if corpus_dir is not None and samples_name is not None:
        corpus = Corpus(corpus_dir)
        # The vocabulary is not loaded in this variant (the call was
        # already commented out).
        samples = Samples(corpus, samples_name)
        samples.load()
        samples.show()
def do_show(corpus_dir, samples_name):
    """Load the vocabulary and a named sample set, then call ``Samples.show``.

    corpus_dir   -- location handed to ``Corpus``; None makes this a no-op.
    samples_name -- name of the sample set; None makes this a no-op.

    Returns None.
    """
    # Do the work only when both arguments were supplied.
    if corpus_dir is not None and samples_name is not None:
        source_corpus = Corpus(corpus_dir)
        source_corpus.vocabulary.load()

        sample_set = Samples(source_corpus, samples_name)
        sample_set.load()
        sample_set.show()
def do_query_sample_by_pu(corpus_dir, positive_name_list, unlabeled_name, sample_id):
    """Query a sample by id against merged positive sets and an unlabeled set.

    corpus_dir         -- location handed to ``Corpus``.
    positive_name_list -- iterable of sample-set names; each set is loaded
                          and merged into the first one via ``Samples.merge``.
    unlabeled_name     -- name of the unlabeled sample set.
    sample_id          -- id forwarded to ``Corpus.query_by_id``.

    If ``positive_name_list`` is empty, None is passed as the positive set.
    Returns None.
    """
    source_corpus = Corpus(corpus_dir)

    merged_positive = None
    for name in positive_name_list:
        loaded = Samples(source_corpus, name)
        loaded.load()
        if merged_positive is not None:
            merged_positive.merge(loaded)
        else:
            # The first set loaded becomes the merge target.
            merged_positive = loaded
        # Drop the local reference once the set has been handed over.
        loaded = None

    unlabeled = Samples(source_corpus, unlabeled_name)
    unlabeled.load()

    source_corpus.query_by_id(merged_positive, unlabeled, sample_id)
def do_iem(corpus_dir, positive_name, unlabeled_name, result_dir):
    """Split one sample set into positive/unlabeled parts and run ``rn_iem``.

    corpus_dir     -- location handed to ``Corpus``.
    positive_name  -- name of the sample set that is loaded and split.
    unlabeled_name -- currently unused; loading a separate unlabeled set
                      is disabled.
    result_dir     -- forwarded unchanged to ``rn_iem``.

    The category id (4000000) and split ratio (0.8) are fixed in this
    function. Returns None.
    """
    source_corpus = Corpus(corpus_dir)
    source_corpus.vocabulary.load()

    labeled = Samples(source_corpus, positive_name)
    labeled.load()

    logging.debug(Logger.debug("I-EM ..."))

    category_id = 4000000
    ratio = 0.8

    matrix = labeled.tsm
    # NOTE(review): do_sem() calls crossvalidation_by_category_1 with a
    # different argument list (extra ratio, positive_random/negative_random
    # keywords) -- confirm both call shapes are accepted.
    split = matrix.crossvalidation_by_category_1(category_id, ratio, random = False)
    positive_part, unlabeled_part = split

    positive_matrix = matrix.clone(positive_part)
    unlabeled_matrix = matrix.clone(unlabeled_part)

    rn_iem(category_id, positive_matrix, unlabeled_matrix, result_dir)
def do_pulearning(corpus_dir, positive_name, unlabeled_name, result_dir):
    """Load a positive and an unlabeled sample set and run ``PULearning_test``.

    corpus_dir     -- location handed to ``Corpus``.
    positive_name  -- name of the positive sample set.
    unlabeled_name -- name of the unlabeled sample set.
    result_dir     -- accepted but not used by this function.

    Returns None.
    """
    # The vocabulary is not loaded here, and only a single positive set is
    # used; merging several positive sets is disabled in this function.
    source_corpus = Corpus(corpus_dir)

    labeled = Samples(source_corpus, positive_name)
    labeled.load()

    unlabeled = Samples(source_corpus, unlabeled_name)
    unlabeled.load()

    PULearning_test(labeled, unlabeled)
def do_sem(corpus_dir, positive_name, unlabeled_name, result_dir):
    """Split one sample set into positive/unlabeled parts and run ``rn_sem``.

    corpus_dir     -- location handed to ``Corpus``.
    positive_name  -- name of the sample set that is loaded and split.
    unlabeled_name -- currently unused; loading a separate unlabeled set
                      is disabled.
    result_dir     -- forwarded unchanged to ``rn_sem``.

    The category id and both split ratios are fixed in this function.
    Returns None.
    """
    corpus = Corpus(corpus_dir)
    corpus.vocabulary.load()

    samples_positive = Samples(corpus, positive_name)
    samples_positive.load()

    # This call used a lowercase ``logger`` name, unlike the second log call
    # below and the sibling do_* functions; use ``logging`` consistently.
    logging.debug(Logger.debug("S-EM ..."))

    # Alternative category ids noted here previously (labels translated):
    #   1000000 - power supply service
    #   2000000 - human resources management
    #   6000000 - Party building and conduct
    #             NOTE(review): same id as the active category -- confirm.
    #   8000000 - law-based corporate governance
    positive_category_id = 6000000  # production safety
    positive_ratio = 0.4
    # Ratio of the remaining samples: (1 - positive_ratio) * negative_ratio
    negative_ratio = 0.66

    tsm = samples_positive.tsm
    positive_samples_list, unlabeled_samples_list = tsm.crossvalidation_by_category_1(
        positive_category_id,
        positive_ratio,
        negative_ratio,
        positive_random = False,
        negative_random = False)

    tsm_positive = tsm.clone(positive_samples_list)
    tsm_unlabeled = tsm.clone(unlabeled_samples_list)

    total_positive_samples = tsm_positive.get_total_samples()
    total_unlabeled_samples = tsm_unlabeled.get_total_samples()
    logging.debug(Logger.debug("do_sem() %d samples in tsm_positive, %d samples in tsm_unlabeled." % (total_positive_samples, total_unlabeled_samples)))

    rn_sem(positive_category_id, tsm_positive, tsm_unlabeled, result_dir)
def do_refresh(corpus_dir, samples_name):
    """Load a named sample set and run ``Fix.refresh_content`` on it.

    corpus_dir   -- location handed to ``Corpus``.
    samples_name -- name of the sample set handed to ``Samples``.

    Returns None.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    fixer = Fix(sample_set)
    fixer.refresh_content()