Exemplo n.º 1
0
def read_labeled_text_data(file_path,
                           encoding,
                           save=False,
                           save_path=None,
                           flatten=False,
                           make_ft=True):
    """Read one labeled text file and build its features and labels.

    Args:
        file_path: Path of the labeled file to read.
        encoding: Text encoding handed to ``codecs.open``.
        save: When exactly ``True``, pickle prefixes of the data (see below).
        save_path: Path prefix of the pickle files; required when ``save``
            is ``True``.
        flatten: Forwarded unchanged to the ``pre`` helpers.
        make_ft: When true, build the per-sentence feature list ``X``;
            otherwise ``X`` is returned as an empty list.

    Returns:
        ``(orisents, nesents, sents, X, y)``: the three values produced by
        ``pre.read_labeled_file`` followed by the features and labels
        derived from ``sents``.

    Raises:
        ValueError: If ``save`` is ``True`` but ``save_path`` is ``None``.
    """
    # Fail early with a clear message instead of a TypeError on string
    # concatenation after all the parsing work has been done.
    if save is True and save_path is None:
        raise ValueError('save_path is required when save is True')

    # The context manager guarantees the handle is closed even when parsing
    # or feature extraction raises.
    with codecs.open(file_path, 'r', encoding=encoding) as f:
        orisents, nesents, sents = pre.read_labeled_file(f, flatten=flatten)
        X = []
        if make_ft:
            X = [pre.sent2features(s, flatten=flatten) for s in sents]
        y = [pre.sent2labels(s, flatten=flatten) for s in sents]

    if save is True:
        # One pickle file per prefix size; each file holds three consecutive
        # dumps: sentences, features, labels.
        save_points = (10000, 50000, 200000, 1000000)
        for sp in save_points:
            st.write_log('writing : ' + str(sp) + '.pkl',
                         open=True,
                         close=True,
                         std_print=False)
            with open(save_path + str(sp) + '.pkl', 'wb') as output:
                pickle.dump(sents[:sp], output, pickle.HIGHEST_PROTOCOL)
                pickle.dump(X[:sp], output, pickle.HIGHEST_PROTOCOL)
                pickle.dump(y[:sp], output, pickle.HIGHEST_PROTOCOL)
    return orisents, nesents, sents, X, y
Exemplo n.º 2
0
 def out_act_in_golden(self, boot_iter):
     """Write the pending active data out and wait for its labeled version.

     The buffered ``X_bf_active`` / ``y_bf_active`` data is written to
     ``<active_dir><boot_iter>.out``. The method then prompts on stdin
     until the input file exists (``.in``, or the ``.out`` file itself
     when ``st.ACTIVE_DEBUG`` is set), reads it back and resets the
     buffers.

     Returns:
         ``(X_golden, y_golden, yprob_golden)``.
     """
     self.boot_iter = boot_iter
     if not os.path.exists(self.active_dir):
         os.makedirs(self.active_dir)
     out_fn = self.active_dir + str(boot_iter) + '.out'
     utils.write_result_from_ft(self.X_bf_active, self.y_bf_active, out_fn)

     while True:
         print('ready?')
         # Block until the operator presses enter; the typed text is ignored.
         raw_input()
         suffix = '.out' if st.ACTIVE_DEBUG == True else '.in'
         active_in_fn = self.active_dir + str(boot_iter) + suffix
         if not os.path.exists(active_in_fn):
             continue
         # NOTE(review): expects utils.read_labeled_text_data to return three
         # values here -- confirm against the utils version in use.
         act_sents, X_golden, y_golden = utils.read_labeled_text_data(
             active_in_fn, encoding=st.ENCODING, flatten=st.FLATTEN)
         st.write_log(str(len(act_sents)) +
                      ' sents are read from active data.',
                      open=True,
                      close=True,
                      std_print=True)
         y_golden, _, _ = utils._remove_all_o(y_golden, [], [])
         break

     print(active_in_fn)
     yprob_golden = utils.generate_all(y_golden, 1)

     if st.ACTIVE_DEBUG == True:
         # Debug sanity check: features read back must equal those written.
         for seq_before, seq_after in zip(self.X_bf_active, X_golden):
             for ft_before, ft_after in zip(seq_before, seq_after):
                 if tuple(ft_after) != tuple(ft_before):
                     print('error!!!')

     # Reset the active buffers for the next round.
     self.X_bf_active = []
     self.y_bf_active = []
     self.active_put_count = 0
     return X_golden, y_golden, yprob_golden
Exemplo n.º 3
0
 def _add_n_train_CRF(self, X, y, yprob=None, clear=False):
     """Append non-empty sequences to the trainer and train a new model.

     Args:
         X: Feature sequences.
         y: Label sequences, parallel to ``X``; pairs whose label sequence
             is empty are skipped.
         yprob: Optional per-sequence probabilities, parallel to ``X``.
             Only used for the pickle dump of the added data.
         clear: When true and this is not the first iteration, delete the
             previous iteration's ``.crfsuite`` model file before training.

     Side effects:
         Appends to ``self.trainer``; extends ``self.X_total`` /
         ``self.y_total`` when ``st.DICTIONARY`` is ``True``; writes a
         pickle of the added data when ``self.save_path`` is set; writes
         the model to ``<model_path><model_name><iter>.crfsuite``; and
         increments ``self.iter``.
     """
     this_model_name = self.model_name + str(self.iter)
     # Identity test: ``yprob != None`` is ambiguous (or raises) for
     # array-like inputs that overload comparison.
     has_prob = yprob is not None

     X_added = []
     y_added = []
     yprob_added = []
     if has_prob:
         rows = zip(X, y, yprob)
     else:
         rows = ((xseq, yseq, None) for xseq, yseq in zip(X, y))
     for xseq, yseq, yseqprob in rows:
         if len(yseq) == 0:
             continue
         X_added.append(xseq)
         y_added.append(yseq)
         if has_prob:
             yprob_added.append(yseqprob)
         self.trainer.append(xseq, yseq)

     if st.DICTIONARY is True:
         self.X_total += X_added  # added for dic
         self.y_total += y_added

     if self.save_path is not None:
         st.write_log('writing added data file',
                      open=True,
                      close=True,
                      std_print=False)
         pkl_name = str(self.iter) + '.pkl'
         save_path = self.save_path
         # Do not overwrite an existing dump for this iteration; write to
         # an 'R'-suffixed prefix instead.
         if os.path.isfile(save_path + pkl_name):
             save_path += 'R'
         with open(save_path + pkl_name, 'wb') as output:
             pickle.dump(X_added, output, pickle.HIGHEST_PROTOCOL)
             pickle.dump(y_added, output, pickle.HIGHEST_PROTOCOL)
             if has_prob:
                 pickle.dump(yprob_added, output, pickle.HIGHEST_PROTOCOL)

     if clear and self.iter > 0:
         os.remove(self.model_path + self.model_name + str(self.iter - 1) +
                   '.crfsuite')

     self.trainer.set_params({
         'c1': 0.0,  # coefficient for L1 penalty
         'c2': 0.0,  # coefficient for L2 penalty
         'max_iterations': st.CRF_ITER,  # stop earlier
         # include transitions that are possible, but not observed
         'feature.possible_transitions': True
     })
     # NOTE(review): the results of the next call and of the
     # ``last_iteration`` access below are discarded; both are kept only to
     # preserve the original call sequence -- confirm they can be removed.
     self.trainer.params()
     self.trainer.train(self.model_path + this_model_name +
                        '.crfsuite')  # model save
     self.trainer.logparser.last_iteration
     self.iter += 1
Exemplo n.º 4
0
 def eval_prediction(self, ypred, tag_conf_table=False, log=True):
     """Report precision, recall and F1 for ``ypred`` and return the F1.

     Args:
         ypred: Predictions passed to ``self.eval``, which must return the
             triple ``(correct, predicted, answer)`` counts.
         tag_conf_table: When true, call ``self._draw_confusion_table`` for
             every tag in ``st.TAG``.
         log: When true, emit the report through ``st.write_log``;
             otherwise print it.

     Returns:
         The F1 score as a float. A ratio whose denominator is zero (no
         predictions, no answers, or precision + recall == 0) is reported
         as 0.0 instead of raising ``ZeroDivisionError``.
     """
     cor, pred, ans = self.eval(ypred)
     if tag_conf_table:
         for tag in st.TAG:
             self._draw_confusion_table(ypred, tag)

     # Guard every division: an empty prediction or answer set is a valid
     # input and must not crash the evaluation.
     precision = float(cor) / pred if pred else 0.0
     recall = float(cor) / ans if ans else 0.0
     pr_sum = precision + recall
     f1score = 2.0 * (precision * recall) / pr_sum if pr_sum else 0.0

     out_str = 'evaluation\n'
     out_str += 'NEs : ' + str(pred) + ', precision : ' + str(
         precision) + ', recall : ' + str(recall) + ', f1score : ' + str(
             f1score) + '\n'
     if log:
         st.write_log(out_str, open=True, close=True, std_print=True)
     else:
         print(out_str)
     return f1score
Exemplo n.º 5
0
 def put_act_n_get_remain(self, X_auto, y_auto, y_mar_prob):
     """Move the selected sequences into the active buffer; return the rest.

     ``self._select_in_range`` splits the inputs into a selected part and
     a remaining part. The selected features and labels are saved through
     ``self._save`` and appended to ``X_bf_active`` / ``y_bf_active``.

     Returns:
         ``(X_remain, y_remain, yprob_remain)``.
     """
     (picked_X, picked_y, _picked_prob,
      rest_X, rest_y, rest_prob) = self._select_in_range(
          X_auto, y_auto, y_mar_prob)

     st.write_log(str(len(picked_X)) + ' sents are added to active data.',
                  open=True,
                  close=True,
                  std_print=True)

     # TODO (from the original notes): write a per-call pickle named
     # <boot_iter>_<active_put_count>.pkl and prune the oldest files.
     self._save(picked_X, picked_y)
     self.X_bf_active += picked_X
     self.y_bf_active += picked_y
     self.active_put_count += 1
     return rest_X, rest_y, rest_prob
Exemplo n.º 6
0
 def write_ML_data(self, orisents, nesents, X_auto, y_auto, y_mar_prob):
     """Split the data into 'active' and 'good' parts and write both files.

     ``self._select_in_range`` partitions the five parallel inputs. The
     selected part goes to ``<active_dir><boot_iter>_active.txt`` and the
     remainder to ``<good_dir><boot_iter>_good.txt``. Probabilities are
     passed to the writer only when ``st.PROB_OUT`` is set.

     Returns:
         ``(active_path, good_path)``: the two file names written.
     """
     (ori_act, ne_act, X_act, y_act, prob_act,
      ori_good, ne_good, X_good, y_good, prob_good) = self._select_in_range(
          orisents, nesents, X_auto, y_auto, y_mar_prob)

     st.write_log(str(len(X_act)) + ':' + str(len(X_good)) +
                  ' = active : good',
                  open=True,
                  close=True,
                  std_print=True)

     with_prob = st.PROB_OUT == True
     active_fn = self.active_dir + str(self.boot_iter) + '_active' + '.txt'
     good_fn = self.good_dir + str(self.boot_iter) + '_good' + '.txt'

     # Active file first, then good file.
     outputs = (
         (active_fn, ori_act, ne_act, X_act, y_act, prob_act),
         (good_fn, ori_good, ne_good, X_good, y_good, prob_good),
     )
     for out_fn, ori_part, ne_part, X_part, y_part, prob_part in outputs:
         print('writing : ' + out_fn)
         if with_prob:
             utils.write_result_from_ft(ori_part,
                                        ne_part,
                                        X_part,
                                        y_part,
                                        out_fn,
                                        yprob=prob_part)
         else:
             utils.write_result_from_ft(ori_part, ne_part, X_part, y_part,
                                        out_fn)
     return active_fn, good_fn
Exemplo n.º 7
0
def main():
    """Run one bootstrapping round: train the models and write ML data.

    Steps, in order:
      1. Create the model directory if needed and read the test, train,
         active, good and unlabeled data sets.
      2. Train the basic CRF on train + active + good data.
      3. Optionally generate the full data set (``GENERATE_FULL``).
      4. Unless ``NON_ACTIVE`` is set, self-label the current slice of
         unlabeled data, train the bagging model, optionally repeat for
         ``st.SELF_ITER_N - 1`` sub-iterations, and write the resulting
         active/good files through ``Active_Assistant``.
      5. Remove model directories according to ``SAVE_MODEL`` and print a
         summary.

    This is Python 2 code (``print`` statements).
    """
    summary = '=========================== summary ============================\n'
    if not os.path.exists(st.MODEL_DIR + MODEL_NUMBER):
        os.makedirs(st.MODEL_DIR + MODEL_NUMBER)
    #st.print_setting()
    print_name()
    st.write_log('Reading files\n', open=True, close=True)
    # Each reader returns five values; the first two are ignored for the
    # labeled sets and kept only for the unlabeled one.
    _,_,test_sents, X_test, y_test = \
        utils.read_labeled_text_data_dir(st.TEST_DIR, encoding=st.ENCODING)##20170912
    _,_,train_sents, X_train, y_train = \
        utils.read_labeled_text_data_dir(st.TRAIN_DIR, encoding=st.ENCODING)##20170912
    _,_,act_sents, X_act, y_act = \
        utils.read_labeled_text_data_dir(st.ACT_DIR, encoding=st.ENCODING)
    _,_,good_sents, X_good, y_good = \
        utils.read_labeled_text_data_dir(st.GOOD_ML_DATA_DIR, encoding=st.ENCODING)
    # Unlabeled data comes from a directory unless a single input file name
    # was configured.
    if len(FULL_IN_FILENAME) == 0:
        unlabeled_orisents, unlabeled_nesents, unlabeled_sents, X_unlabeled, \
        y_unlabeled = utils.read_labeled_text_data_dir(st.UNLABELED_DIR, encoding=st.ENCODING)
    else:
        unlabeled_orisents, unlabeled_nesents, unlabeled_sents, X_unlabeled, \
        y_unlabeled = utils.read_labeled_text_data(FULL_IN_FILENAME, encoding=st.ENCODING)
    # A length mismatch is only reported; execution continues regardless.
    if len(unlabeled_orisents) != len(unlabeled_sents):
        print 'error! check the unlabeled input file!'

    # Training material for the basic CRF: train + active + good data.
    X_basiccrf = X_train + X_act + X_good
    y_basiccrf = y_train + y_act + y_good

    # NOTE(review): `tester` is only bound when more than one test sentence
    # was read, but it is used below whenever st.ALL_TEST is True -- confirm
    # that ALL_TEST is never enabled without test data.
    if len(test_sents) > 1:
        tester = myTagger(X_test=X_test, y_test=y_test, test_sents=test_sents)
    basic_CRF = BasicModel(BASE_MODEL_PATH, BASE_LINE_NAME)
    # NOTE(review): under Python 2 the `/` below is integer division when
    # both settings are ints -- confirm that is intended.
    bagging_model = BaggingModel(BAGGING_MODEL_PATH,
                                 BAGGING_MODEL_NAME,
                                 num_of_comp_mds=st.NUM_BAGGING_MODEL,
                                 boot_sample_size=st.BOOTSTRAP_SAMPLE_SIZE /
                                 st.NUM_BAGGING_MODEL,
                                 X_labeled=X_train,
                                 y_labeled=y_train)

    st.write_log('Training Basic CRF', close=True, std_print=True)
    # Slice of the unlabeled data that belongs to bootstrapping round
    # BOOT_ITER (window of st.BOOTSTRAP_SAMPLE_SIZE items).
    unlabeled_orisents_now =\
        unlabeled_orisents[BOOT_ITER*st.BOOTSTRAP_SAMPLE_SIZE : (BOOT_ITER+1)*st.BOOTSTRAP_SAMPLE_SIZE]
    unlabeled_nesents_now =\
        unlabeled_nesents[BOOT_ITER*st.BOOTSTRAP_SAMPLE_SIZE : (BOOT_ITER+1)*st.BOOTSTRAP_SAMPLE_SIZE]
    X_unlabeled_now =\
        X_unlabeled[BOOT_ITER*st.BOOTSTRAP_SAMPLE_SIZE : (BOOT_ITER+1)*st.BOOTSTRAP_SAMPLE_SIZE]
    y_basiccrf, _, _ = utils._remove_all_o(y_basiccrf, [], [])
    # Two training entry points are used depending on the number of
    # self-training iterations.
    if st.SELF_ITER_N == 1:
        basic_CRF._add_n_train_CRF(X_basiccrf, y_basiccrf)
    else:
        basic_CRF.add_n_train_CRF(X_basiccrf,
                                  y_basiccrf,
                                  clear_past_model=True,
                                  add_total=True,
                                  write_added=True)
    if st.ALL_TEST is True:
        tester.eval_prediction(
            basic_CRF.make_prediction(tester.X_test, remove_all_o=False)[0])
    if GENERATE_FULL is True:
        # generate() returns three values that are appended to the summary.
        anm, a, m = generate(act_sents, y_act, unlabeled_orisents,
                             unlabeled_nesents, unlabeled_sents, X_unlabeled,
                             basic_CRF)
        summary += '=generated full data\n' + '=' + anm + '\n' + '=' + a + '\n' + '=' + m + '\n'
        #y_pred_u_full, y_mar_p_u_full = basic_CRF.make_prediction(X_unlabeled, remove_all_o= False, min_conf= -1.0, link_pos=None)
    if NON_ACTIVE == False:
        # Self-label the current unlabeled slice with the basic CRF and train
        # the bagging model on it.
        y_pred_u =\
            basic_CRF.make_prediction(X_unlabeled_now, remove_all_o=True, min_conf=st.FIXED_MIN_SEQ_PROB,link_pos=None)[0]
        print 'Training Bagging CRF'
        bagging_model.set_selflabeled_data_n_train(X_unlabeled_now, y_pred_u)
        if st.ALL_TEST is True:
            y_pred_test, _ = bagging_model.make_prediction(tester.X_test,
                                                           remove_all_o=False)
            tester.eval_prediction(y_pred_test)

        X_basiccrf = X_unlabeled_now
        # NOTE(review): if st.SELF_ITER_N < 1 neither branch below runs and
        # `y_basiccrf_mar_p` stays unbound for the write_ML_data call -- confirm
        # the setting is always >= 1.
        if st.SELF_ITER_N == 1:
            y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                X_basiccrf,
                remove_all_o=False,
                min_conf=st.FIXED_MIN_MARGINAL_PROB,
                link_pos=None,
                replace_o=False,
                mul_ne_cnt=False)

        elif st.SELF_ITER_N > 1:
            y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                X_basiccrf,
                remove_all_o=True,
                min_conf=st.FIXED_MIN_MARGINAL_PROB,
                link_pos=None,
                replace_o=False,
                mul_ne_cnt=False)
            # Sub-iterations: retrain the basic CRF on the bagging output,
            # re-label the slice, retrain the bagging model.
            for boot_sub_iter in range(1, st.SELF_ITER_N):
                st.write_log('sub-iter : ' + str(boot_sub_iter) + '/' +
                             str(st.SELF_ITER_N - 1),
                             open=True,
                             std_print=True)
                print 'sub : Training Basic CRF'
                basic_CRF.temp_add_n_train_CRF(X_basiccrf, y_basiccrf,
                                               boot_sub_iter)
                if st.ALL_TEST is True:
                    tester.eval_prediction(
                        basic_CRF.make_prediction(tester.X_test,
                                                  remove_all_o=False)[0])
                y_pred_u = basic_CRF.make_prediction(
                    X_unlabeled_now,
                    remove_all_o=True,
                    min_conf=st.FIXED_MIN_SEQ_PROB,
                    link_pos=None)[0]
                print 'sub : Training Bagging CRF'
                bagging_model.set_selflabeled_data_n_train(
                    X_unlabeled_now, y_pred_u)
                if st.ALL_TEST is True:
                    y_pred_test, _ = bagging_model.make_prediction(
                        tester.X_test, remove_all_o=False)
                    tester.eval_prediction(y_pred_test)
                X_basiccrf = X_unlabeled_now
                # Only the last sub-iteration predicts with
                # remove_all_o=False; earlier ones use remove_all_o=True.
                if boot_sub_iter < st.SELF_ITER_N - 1:
                    y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                        X_basiccrf,
                        remove_all_o=True,
                        min_conf=st.FIXED_MIN_MARGINAL_PROB,
                        link_pos=None,
                        replace_o=False,
                        mul_ne_cnt=False)  #remove all o false
                elif boot_sub_iter == st.SELF_ITER_N - 1:
                    y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                        X_basiccrf,
                        remove_all_o=False,
                        min_conf=st.FIXED_MIN_MARGINAL_PROB,
                        link_pos=None,
                        replace_o=False,
                        mul_ne_cnt=False)  #remove all o false

        #print out file & remove model dir
        active_assistant = Active_Assistant(st.active_min_prob,
                                            st.active_max_prob,
                                            st.GOOD_ML_DATA_DIR_OUT,
                                            st.ACT_DIR_OUT, BOOT_ITER)
        # write_ML_data returns two values that are appended to the summary.
        a, g = active_assistant.write_ML_data(unlabeled_orisents_now,
                                              unlabeled_nesents_now,
                                              X_basiccrf, y_basiccrf,
                                              y_basiccrf_mar_p)

        summary += '=generated bootstrapping data\n' + '=' + a + '\n' + '=' + g + '\n'
    # Cleanup: drop the whole model directory, or only the bagging models
    # when the basic model is to be kept.
    if SAVE_MODEL == False:
        shutil.rmtree(st.MODEL_DIR + MODEL_NUMBER + '/')
    else:
        shutil.rmtree(st.MODEL_DIR + MODEL_NUMBER + '/bagging_model/')
        if NON_ACTIVE != True:
            if st.SELF_ITER_N > 1:
                basic_CRF.remove_latest_model()
        summary += '=generated model\n' + '=' + basic_CRF.get_first_model_name(
        ) + '\n'
    summary += '================================================================\n'
    print summary