Пример #1
0
 def out_act_in_golden(self, boot_iter):
     """Write the pending active-learning batch, then wait for its answer file.

     The buffered features/labels (``self.X_bf_active`` / ``self.y_bf_active``)
     are written to ``<active_dir><boot_iter>.out``.  The method then prompts
     on stdin until ``<active_dir><boot_iter>.in`` exists, reads it with
     ``utils.read_labeled_text_data`` and resets the buffers.

     When ``st.ACTIVE_DEBUG`` is set, the ``.out`` file that was just written
     is read back instead of the ``.in`` file, and the features read back are
     compared against the buffered ones.

     NOTE(review): the paths are built by plain string concatenation, so
     ``self.active_dir`` presumably ends with a path separator -- confirm.

     :param boot_iter: iteration number; stored on ``self`` and used as the
         file name stem.
     :return: tuple ``(X_golden, y_golden, yprob_golden)`` where ``y_golden``
         has been passed through ``utils._remove_all_o`` and ``yprob_golden``
         is ``utils.generate_all(y_golden, 1)``.
     """
     self.boot_iter = boot_iter
     if not os.path.exists(self.active_dir):
         os.makedirs(self.active_dir)

     iter_prefix = self.active_dir + str(boot_iter)
     utils.write_result_from_ft(self.X_bf_active, self.y_bf_active,
                                iter_prefix + '.out')

     # Debug mode short-circuits the round trip by reading our own output.
     suffix = '.out' if st.ACTIVE_DEBUG == True else '.in'
     active_in_fn = iter_prefix + suffix

     while True:
         print('ready?')
         # Block until the operator presses Enter; the typed text is ignored.
         raw_input()
         if not os.path.exists(active_in_fn):
             # Answer file is not there yet: prompt again.
             continue
         act_sents, X_golden, y_golden = utils.read_labeled_text_data(
             active_in_fn, encoding=st.ENCODING, flatten=st.FLATTEN)
         log_msg = str(len(act_sents)) + ' sents are read from active data.'
         st.write_log(log_msg, open=True, close=True, std_print=True)
         y_golden, _, _ = utils._remove_all_o(y_golden, [], [])
         break

     print(active_in_fn)
     yprob_golden = utils.generate_all(y_golden, 1)

     if st.ACTIVE_DEBUG == True:
         # Sanity check: features read back must equal the ones written out.
         for seq_before, seq_after in zip(self.X_bf_active, X_golden):
             for ft_before, ft_after in zip(seq_before, seq_after):
                 if tuple(ft_after) != tuple(ft_before):
                     print('error!!!')

     # The batch has been consumed; start the next one from scratch.
     self.X_bf_active = []
     self.y_bf_active = []
     self.active_put_count = 0
     return X_golden, y_golden, yprob_golden
def main():
    """Run the bootstrapping loop over chunks of the unlabeled data.

    Steps, as performed below:

    1. Create ``../<MODEL_NUMBER>`` if missing and read the test, train and
       unlabeled files from ``../data/``.
    2. Seed the basic-CRF training data either from the training file
       (``st.RELOAD is False``) or from a ``Loader`` snapshot.
    3. Split the unlabeled features with ``utils.split_set_w_size`` and, for
       each chunk from ``st.START_ITER`` on: train the basic CRF on the
       current data, evaluate it on the test set, label the chunk with it,
       and (unless ``st.BAGGING is False``) train the bagging model on that
       self-labeled chunk and relabel the chunk with the bagging model.  The
       chunk and its labels become the training data of the next iteration.
    4. After the last chunk, train and evaluate the basic CRF once more on
       the data produced by the final iteration.

    If there is no chunk to process (``st.START_ITER`` is not smaller than
    the number of chunks) only step 4 runs, on the seed data.  The previous
    code raised ``NameError`` in that situation because it incremented the
    loop variable after a loop that had never executed.
    """
    model_dir = '../' + MODEL_NUMBER
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if st.RELOAD == True:
        print('---------reload---------')
    st.print_setting()
    print_name()
    print('Reading files')
    test_sents, X_test, y_test = utils.read_labeled_text_data(
        '../data/' + st.TEST_FILE, encoding=st.ENCODING)
    _, X_train, y_train = utils.read_labeled_text_data(
        '../data/' + st.TRAIN_FILE, encoding=st.ENCODING)
    # Only the features of the unlabeled file are used below.
    _, X_unlabeled, _ = utils.read_labeled_text_data(
        '../data/' + st.UNLABELED_FILE, encoding=st.ENCODING)

    if st.RELOAD is False:
        X_basiccrf = X_train
        y_basiccrf = y_train
        y_basiccrf_mar_p = utils.generate_all(y_basiccrf, 1)
    else:
        loader = Loader(BASE_DATA_SAVE_PATH, LOAD_MODEL_N)
        X_basiccrf = loader.get_X()
        y_basiccrf = loader.get_y()
        y_basiccrf_mar_p = loader.get_yprob()

    X_unlabeled_devied = utils.split_set_w_size(
        X_unlabeled, st.BOOTSTRAP_SAMPLE_SIZE, st.BOOT_ITER_LIMIT)
    n_chunks = len(X_unlabeled_devied)
    tester = myTagger(X_test=X_test, y_test=y_test, test_sents=test_sents)
    basic_CRF = BasicModel(BASE_MODEL_PATH, BASE_LINE_NAME,
                           BASE_DATA_SAVE_PATH, st.START_ITER)
    # NOTE(review): '/' is integer division only under Python 2 with int
    # settings; confirm BaggingModel accepts a float if run on Python 3.
    bagging_model = BaggingModel(
        BAGGING_MODEL_PATH, BAGGING_MODEL_NAME,
        num_of_comp_mds=st.NUM_BAGGING_MODEL,
        boot_sample_size=st.BOOTSTRAP_SAMPLE_SIZE / st.NUM_BAGGING_MODEL,
        X_labeled=X_train, y_labeled=y_train, save_path=None,
        start_iter=st.START_ITER)

    # Index reported for the final training pass.  Tracked explicitly so it
    # is defined even when the loop below has nothing to iterate over.
    next_iter = st.START_ITER
    for boot_iter in range(st.START_ITER, n_chunks):
        next_iter = boot_iter + 1
        print('boot : ' + str(boot_iter) + '/' + str(n_chunks))
        print('Training Basic CRF')

        X_unlabeled_now = X_unlabeled_devied[boot_iter]

        basic_CRF.add_n_train_CRF(X_basiccrf, y_basiccrf, y_basiccrf_mar_p)

        tester.eval_prediction(
            basic_CRF.make_prediction(tester.X_test, remove_all_o=False)[0])
        y_pred_u = basic_CRF.make_prediction(
            X_unlabeled_now,
            remove_all_o=st.REMOVE_ALL_ZERO,
            min_conf=st.FIXED_MIN_SEQ_PROB)[0]

        if st.BAGGING is False:
            # Plain self-training: feed the basic CRF's own labels back in,
            # without marginal probabilities.
            X_basiccrf = X_unlabeled_now
            y_basiccrf = y_pred_u
            y_basiccrf_mar_p = None
            continue

        print('Training Bagging CRFs')
        bagging_model.set_selflabeled_data_n_train(X_unlabeled_now, y_pred_u)
        y_pred_test, _ = bagging_model.make_prediction(tester.X_test,
                                                       remove_all_o=False)
        tester.eval_prediction(y_pred_test)

        # The confidence threshold depends on the voting scheme in use.
        if st.VOTE_ON_DIST is True:
            min_conf = st.FIXED_MIN_MARGINAL_PROB
        else:
            min_conf = st.FIXED_MIN_PERC_OF_VOTES
        X_basiccrf = X_unlabeled_now
        y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
            X_basiccrf, remove_all_o=st.REMOVE_ALL_ZERO, min_conf=min_conf)

    # Final pass: train on whatever the last iteration produced and evaluate.
    print('boot : ' + str(next_iter) + '/' + str(n_chunks))
    print('Training Basic CRF')

    basic_CRF.add_n_train_CRF(X_basiccrf, y_basiccrf, y_basiccrf_mar_p)

    tester.eval_prediction(
        basic_CRF.make_prediction(tester.X_test, remove_all_o=False)[0])
Пример #3
0
def main():
    """Run one bootstrapping round and print a summary of what was produced.

    Steps, as performed below:

    1. Create ``st.MODEL_DIR + MODEL_NUMBER`` if missing and read the test,
       train, active and "good" data directories plus the unlabeled data
       (from ``st.UNLABELED_DIR``, or from ``FULL_IN_FILENAME`` when set).
    2. Train the basic CRF on train + active + good data and, when
       ``st.ALL_TEST`` is set and a tester exists, evaluate it.
    3. When ``GENERATE_FULL`` is set, call ``generate`` on the whole
       unlabeled set and record its three returned strings in the summary.
    4. Unless ``NON_ACTIVE`` is set, label the ``BOOT_ITER``-th slice of the
       unlabeled data, train the bagging model on it, repeat for
       ``st.SELF_ITER_N - 1`` sub-iterations, and hand the final labels to
       ``Active_Assistant.write_ML_data``.
    5. Remove the model directory (or only its ``bagging_model/`` part when
       ``SAVE_MODEL`` is set) and print the summary.

    Evaluation is skipped, with a warning, when ``st.ALL_TEST`` is set but
    the test set has at most one sentence; previously that combination
    raised ``NameError`` because no tester had been created.

    :raises ValueError: on the active path when ``st.SELF_ITER_N`` is below
        1.  Previously this surfaced only after training, as an unbound
        local variable.
    """
    summary = '=========================== summary ============================\n'
    model_dir = st.MODEL_DIR + MODEL_NUMBER
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    print_name()
    st.write_log('Reading files\n', open=True, close=True)
    _, _, test_sents, X_test, y_test = utils.read_labeled_text_data_dir(
        st.TEST_DIR, encoding=st.ENCODING)
    _, _, _, X_train, y_train = utils.read_labeled_text_data_dir(
        st.TRAIN_DIR, encoding=st.ENCODING)
    _, _, act_sents, X_act, y_act = utils.read_labeled_text_data_dir(
        st.ACT_DIR, encoding=st.ENCODING)
    _, _, _, X_good, y_good = utils.read_labeled_text_data_dir(
        st.GOOD_ML_DATA_DIR, encoding=st.ENCODING)
    if not FULL_IN_FILENAME:
        (unlabeled_orisents, unlabeled_nesents, unlabeled_sents, X_unlabeled,
         _) = utils.read_labeled_text_data_dir(st.UNLABELED_DIR,
                                               encoding=st.ENCODING)
    else:
        # NOTE(review): five values are unpacked here -- confirm that
        # utils.read_labeled_text_data returns the same shape as the
        # *_dir variant.
        (unlabeled_orisents, unlabeled_nesents, unlabeled_sents, X_unlabeled,
         _) = utils.read_labeled_text_data(FULL_IN_FILENAME,
                                           encoding=st.ENCODING)
    if len(unlabeled_orisents) != len(unlabeled_sents):
        print('error! check the unlabeled input file!')

    X_basiccrf = X_train + X_act + X_good
    y_basiccrf = y_train + y_act + y_good

    # A tester is only built when there is test data to evaluate against.
    tester = None
    if len(test_sents) > 1:
        tester = myTagger(X_test=X_test, y_test=y_test, test_sents=test_sents)
    run_tests = st.ALL_TEST is True and tester is not None
    if st.ALL_TEST is True and tester is None:
        print('warning: ALL_TEST is set but no test data was loaded; '
              'skipping evaluation')

    basic_CRF = BasicModel(BASE_MODEL_PATH, BASE_LINE_NAME)
    # NOTE(review): '/' is integer division only under Python 2 with int
    # settings; confirm BaggingModel accepts a float if run on Python 3.
    bagging_model = BaggingModel(BAGGING_MODEL_PATH,
                                 BAGGING_MODEL_NAME,
                                 num_of_comp_mds=st.NUM_BAGGING_MODEL,
                                 boot_sample_size=st.BOOTSTRAP_SAMPLE_SIZE /
                                 st.NUM_BAGGING_MODEL,
                                 X_labeled=X_train,
                                 y_labeled=y_train)

    st.write_log('Training Basic CRF', close=True, std_print=True)
    # Slice of the unlabeled data handled in this round.
    start = BOOT_ITER * st.BOOTSTRAP_SAMPLE_SIZE
    stop = (BOOT_ITER + 1) * st.BOOTSTRAP_SAMPLE_SIZE
    unlabeled_orisents_now = unlabeled_orisents[start:stop]
    unlabeled_nesents_now = unlabeled_nesents[start:stop]
    X_unlabeled_now = X_unlabeled[start:stop]

    y_basiccrf, _, _ = utils._remove_all_o(y_basiccrf, [], [])
    if st.SELF_ITER_N == 1:
        basic_CRF._add_n_train_CRF(X_basiccrf, y_basiccrf)
    else:
        basic_CRF.add_n_train_CRF(X_basiccrf,
                                  y_basiccrf,
                                  clear_past_model=True,
                                  add_total=True,
                                  write_added=True)
    if run_tests:
        tester.eval_prediction(
            basic_CRF.make_prediction(tester.X_test, remove_all_o=False)[0])

    if GENERATE_FULL is True:
        anm, full_a, full_m = generate(act_sents, y_act, unlabeled_orisents,
                                       unlabeled_nesents, unlabeled_sents,
                                       X_unlabeled, basic_CRF)
        summary += ('=generated full data\n' + '=' + anm + '\n' + '=' +
                    full_a + '\n' + '=' + full_m + '\n')

    if NON_ACTIVE == False:
        if st.SELF_ITER_N < 1:
            # Without at least one pass there are no bagging predictions to
            # write out; fail before the expensive training starts.
            raise ValueError('st.SELF_ITER_N must be >= 1 when NON_ACTIVE is '
                             'False, got ' + repr(st.SELF_ITER_N))

        def bagging_predict(remove_all_o):
            """Label the current unlabeled slice with the bagging model."""
            return bagging_model.make_prediction(
                X_unlabeled_now,
                remove_all_o=remove_all_o,
                min_conf=st.FIXED_MIN_MARGINAL_PROB,
                link_pos=None,
                replace_o=False,
                mul_ne_cnt=False)

        y_pred_u = basic_CRF.make_prediction(
            X_unlabeled_now,
            remove_all_o=True,
            min_conf=st.FIXED_MIN_SEQ_PROB,
            link_pos=None)[0]
        print('Training Bagging CRF')
        bagging_model.set_selflabeled_data_n_train(X_unlabeled_now, y_pred_u)
        if run_tests:
            y_pred_test, _ = bagging_model.make_prediction(tester.X_test,
                                                           remove_all_o=False)
            tester.eval_prediction(y_pred_test)

        X_basiccrf = X_unlabeled_now
        # Only the prediction that ends up being written out is requested
        # with remove_all_o=False; predictions that feed another
        # sub-iteration use remove_all_o=True.
        y_basiccrf, y_basiccrf_mar_p = bagging_predict(
            remove_all_o=st.SELF_ITER_N > 1)

        last_sub_iter = st.SELF_ITER_N - 1
        # Empty when SELF_ITER_N == 1.
        for boot_sub_iter in range(1, st.SELF_ITER_N):
            st.write_log('sub-iter : ' + str(boot_sub_iter) + '/' +
                         str(st.SELF_ITER_N - 1),
                         open=True,
                         std_print=True)
            print('sub : Training Basic CRF')
            basic_CRF.temp_add_n_train_CRF(X_basiccrf, y_basiccrf,
                                           boot_sub_iter)
            if run_tests:
                tester.eval_prediction(
                    basic_CRF.make_prediction(tester.X_test,
                                              remove_all_o=False)[0])
            y_pred_u = basic_CRF.make_prediction(
                X_unlabeled_now,
                remove_all_o=True,
                min_conf=st.FIXED_MIN_SEQ_PROB,
                link_pos=None)[0]
            print('sub : Training Bagging CRF')
            bagging_model.set_selflabeled_data_n_train(
                X_unlabeled_now, y_pred_u)
            if run_tests:
                y_pred_test, _ = bagging_model.make_prediction(
                    tester.X_test, remove_all_o=False)
                tester.eval_prediction(y_pred_test)
            X_basiccrf = X_unlabeled_now
            y_basiccrf, y_basiccrf_mar_p = bagging_predict(
                remove_all_o=boot_sub_iter < last_sub_iter)

        # Write the output files for this round.
        active_assistant = Active_Assistant(st.active_min_prob,
                                            st.active_max_prob,
                                            st.GOOD_ML_DATA_DIR_OUT,
                                            st.ACT_DIR_OUT, BOOT_ITER)
        act_msg, good_msg = active_assistant.write_ML_data(
            unlabeled_orisents_now, unlabeled_nesents_now, X_basiccrf,
            y_basiccrf, y_basiccrf_mar_p)

        summary += ('=generated bootstrapping data\n' + '=' + act_msg + '\n' +
                    '=' + good_msg + '\n')

    if SAVE_MODEL == False:
        shutil.rmtree(st.MODEL_DIR + MODEL_NUMBER + '/')
    else:
        shutil.rmtree(st.MODEL_DIR + MODEL_NUMBER + '/bagging_model/')
        if NON_ACTIVE != True:
            if st.SELF_ITER_N > 1:
                basic_CRF.remove_latest_model()
        summary += ('=generated model\n' + '=' +
                    basic_CRF.get_first_model_name() + '\n')
    summary += '================================================================\n'
    print(summary)