示例#1
0
文件: digger.py 项目: uukuguy/digger
def do_query_sample(corpus_dir, samples_name, sample_id):
    """Load a sample set from a corpus and query a single sample by id.

    The corpus vocabulary is loaded first, then the sample set named
    ``samples_name`` is loaded and ``sample_id`` is handed to its
    ``query_by_id`` method.
    """
    source_corpus = Corpus(corpus_dir)
    source_corpus.vocabulary.load()

    sample_set = Samples(source_corpus, samples_name)
    sample_set.load()

    sample_set.query_by_id(sample_id)
示例#2
0
文件: digger.py 项目: uukuguy/digger
def do_purge(corpus_dir, samples_name):
    """Load the sample set ``samples_name`` and run ``Fix.purge`` on it."""
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    fixer = Fix(sample_set)
    fixer.purge()
示例#3
0
文件: digger.py 项目: uukuguy/digger
def do_rebuild_categories(corpus_dir, samples_name):
    """Fix and then rebuild the categories of a sample set.

    Loads the sample set ``samples_name`` from ``corpus_dir``, runs
    ``Fix.fix_categories`` on it, and finally calls the sample set's own
    ``rebuild_categories``.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    progress_message = Logger.debug("Rebuild base data...")
    logging.debug(progress_message)

    fixer = Fix(sample_set)
    fixer.fix_categories()
    sample_set.rebuild_categories()
示例#4
0
文件: digger.py 项目: uukuguy/digger
def do_predict(corpus_dir, samples_name, model_name, result_dir):
    """Run multi-category prediction over a loaded sample set.

    Loads the corpus vocabulary and the sample set ``samples_name``, then
    passes the samples together with ``model_name`` and ``result_dir`` to
    ``multicategories_predict``.
    """
    source_corpus = Corpus(corpus_dir)
    source_corpus.vocabulary.load()

    sample_set = Samples(source_corpus, samples_name)
    sample_set.load()

    progress_message = Logger.debug("Predicting ...")
    logging.debug(progress_message)

    multicategories_predict(sample_set, model_name, result_dir)
示例#5
0
文件: digger.py 项目: uukuguy/digger
def do_query_categories(corpus_dir, samples_name, xls_file):
    """Query the categories of a sample set, then log completion.

    ``xls_file`` is forwarded unchanged to ``Samples.query_categories``.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()
    sample_set.query_categories(xls_file)

    summary = "Query categories %s/<%s> Done. %s" % (corpus_dir, samples_name, xls_file)
    logging.info(Logger.info(summary))
示例#6
0
文件: digger.py 项目: uukuguy/digger
def do_query_keywords(corpus_dir, samples_name, result_dir):
    """Show the per-category keywords of a sample set, then log completion.

    ``result_dir`` is forwarded unchanged to
    ``Samples.show_category_keywords``.
    """
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()
    sample_set.show_category_keywords(result_dir)

    summary = "Query keywords %s/<%s> Done. %s" % (corpus_dir, samples_name, result_dir)
    logging.info(Logger.info(summary))
示例#7
0
def do_show(corpus_dir, samples_name):
    """Load the sample set ``samples_name`` from ``corpus_dir`` and show it.

    Does nothing when either argument is ``None``.

    The previous guard was ``(not corpus_dir) is None``; ``not x`` always
    yields a bool, which is never ``None``, so the body could never run.
    """
    if corpus_dir is None or samples_name is None:
        return

    corpus = Corpus(corpus_dir)
    # The vocabulary is deliberately not loaded here; this variant only
    # needs the samples themselves.

    samples = Samples(corpus, samples_name)
    samples.load()

    samples.show()
示例#8
0
文件: digger.py 项目: uukuguy/digger
def do_show(corpus_dir, samples_name):
    """Load the vocabulary and the named sample set, then show the samples.

    Nothing happens when either ``corpus_dir`` or ``samples_name`` is
    ``None``.
    """
    if corpus_dir is not None and samples_name is not None:
        source_corpus = Corpus(corpus_dir)
        source_corpus.vocabulary.load()

        sample_set = Samples(source_corpus, samples_name)
        sample_set.load()
        sample_set.show()
示例#9
0
def do_query_sample_by_pu(corpus_dir, positive_name_list, unlabeled_name, sample_id):
    """Query one sample against merged positive sets and an unlabeled set.

    Every sample set named in ``positive_name_list`` is loaded; the first
    one becomes the accumulator and each later one is merged into it. The
    unlabeled set is loaded afterwards, and both are passed with
    ``sample_id`` to ``Corpus.query_by_id``.

    If ``positive_name_list`` is empty, ``None`` is passed as the positive
    set.
    """
    corpus = Corpus(corpus_dir)

    merged_positive = None
    for name in positive_name_list:
        loaded = Samples(corpus, name)
        loaded.load()
        if merged_positive is not None:
            merged_positive.merge(loaded)
            # Drop the reference to the set that has just been merged.
            loaded = None
            continue
        merged_positive = loaded

    unlabeled_set = Samples(corpus, unlabeled_name)
    unlabeled_set.load()

    corpus.query_by_id(merged_positive, unlabeled_set, sample_id)
示例#10
0
def do_iem(corpus_dir, positive_name, unlabeled_name, result_dir):
    """Split the positive sample set by category and run ``rn_iem`` on it.

    Only the sample set ``positive_name`` is loaded; ``unlabeled_name`` is
    accepted but currently unused. The set's ``tsm`` is split with
    ``crossvalidation_by_category_1`` using a fixed category id and ratio,
    each part is cloned into its own matrix, and both are passed to
    ``rn_iem`` together with ``result_dir``.
    """
    source_corpus = Corpus(corpus_dir)
    source_corpus.vocabulary.load()

    positive_set = Samples(source_corpus, positive_name)
    positive_set.load()

    logging.debug(Logger.debug("I-EM ..."))

    # Fixed experiment settings.
    target_category_id = 4000000
    split_ratio = 0.8

    matrix = positive_set.tsm
    split = matrix.crossvalidation_by_category_1(target_category_id, split_ratio, random=False)
    positive_part, unlabeled_part = split

    positive_matrix = matrix.clone(positive_part)
    unlabeled_matrix = matrix.clone(unlabeled_part)

    rn_iem(target_category_id, positive_matrix, unlabeled_matrix, result_dir)
示例#11
0
文件: digger.py 项目: uukuguy/digger
def do_pulearning(corpus_dir, positive_name, unlabeled_name, result_dir):
    """Load a positive and an unlabeled sample set and run ``PULearning_test``.

    The corpus vocabulary is not loaded here, and ``result_dir`` is
    accepted but not used by this function.
    """
    source_corpus = Corpus(corpus_dir)

    # A single positive sample set is used; merging several positive sets
    # into one is not done here.
    positive_set = Samples(source_corpus, positive_name)
    positive_set.load()

    unlabeled_set = Samples(source_corpus, unlabeled_name)
    unlabeled_set.load()

    PULearning_test(positive_set, unlabeled_set)
示例#12
0
def do_sem(corpus_dir, positive_name, unlabeled_name, result_dir):
    """Split the positive sample set by category and run ``rn_sem`` on it.

    Only the sample set ``positive_name`` is loaded; ``unlabeled_name`` is
    accepted but currently unused. The set's ``tsm`` is split with
    ``crossvalidation_by_category_1`` using a fixed category id and fixed
    positive/negative ratios, each part is cloned into its own matrix, the
    sizes are logged, and both matrices are passed to ``rn_sem`` together
    with ``result_dir``.
    """
    corpus = Corpus(corpus_dir)
    corpus.vocabulary.load()
    samples_positive = Samples(corpus, positive_name)
    samples_positive.load()

    # Log through the ``logging`` module, as the second log call in this
    # function does; the previous ``logger.debug`` relied on a separate
    # ``logger`` name.
    logging.debug(Logger.debug("S-EM ..."))

    # Candidate category ids (labels translated from the original notes):
    #   1000000 - power supply service
    #   2000000 - human resources management
    #   6000000 - production safety (currently selected)
    #   6000000 - Party building and conduct (id as written in the original
    #             note; it duplicates the one above - TODO confirm)
    #   8000000 - law-based corporate governance
    positive_category_id = 6000000
    positive_ratio = 0.4
    # Ratio of the remaining samples: (1 - positive_ratio) * negative_ratio
    negative_ratio = 0.66
    tsm = samples_positive.tsm

    positive_samples_list, unlabeled_samples_list = tsm.crossvalidation_by_category_1(
        positive_category_id,
        positive_ratio,
        negative_ratio,
        positive_random=False,
        negative_random=False,
    )

    tsm_positive = tsm.clone(positive_samples_list)
    tsm_unlabeled = tsm.clone(unlabeled_samples_list)

    total_positive_samples = tsm_positive.get_total_samples()
    total_unlabeled_samples = tsm_unlabeled.get_total_samples()
    logging.debug(Logger.debug("do_sem() %d samples in tsm_positive, %d samples in tsm_unlabeled." % (total_positive_samples, total_unlabeled_samples)))

    rn_sem(positive_category_id, tsm_positive, tsm_unlabeled, result_dir)
示例#13
0
文件: digger.py 项目: uukuguy/digger
def do_refresh(corpus_dir, samples_name):
    """Load the sample set ``samples_name`` and run ``Fix.refresh_content`` on it."""
    sample_set = Samples(Corpus(corpus_dir), samples_name)
    sample_set.load()

    fixer = Fix(sample_set)
    fixer.refresh_content()