def evaluate(self, pred_file, gold_file=""):
    """Score aspect-term extraction on a prediction TSV.

    Reads gold and predicted tag sequences from ``pred_file`` and returns
    precision/recall/F1 (plus their "-head" variants) formatted to two
    decimal places, keyed exactly as the downstream consumers expect.
    ``gold_file`` is accepted for interface compatibility but unused.
    """
    _, gold_tags, predicted_tags = collect_data_from_tsv(pred_file)
    # f1_score yields the six metrics in the same order as the report keys.
    metrics = self.f1_score(gold_tags, predicted_tags)
    names = ("precision", "recall", "f1-score",
             "precision-head", "recall-head", "f1-score-head")
    return {name: "{:.2f}".format(value) for name, value in zip(names, metrics)}
def convert(self, tsvfile, **kwargs):
    """Inject predicted aspect terms from ``tsvfile`` into a SemEval XML file.

    Keyword Args:
        semfile: path to the SemEval XML file to annotate (required).

    Returns:
        Path of the written XML file (``data/evaluation/ate_pred.xml``).

    Raises:
        ValueError: if the required ``semfile`` keyword is missing.
        FileNotFoundError: if either input file does not exist.

    Fixes vs. original: ``raise ("...")`` raised a *string*, which is a
    TypeError in Python 3 — proper exception classes are used now; the
    ``if kwargs is not None`` guard was always true (``kwargs`` is always a
    dict), so a missing ``semfile`` crashed with KeyError instead of the
    intended message.
    """
    if "semfile" not in kwargs:
        raise ValueError("[!] semfile not found")
    semfile = kwargs["semfile"]
    if not os.path.isfile(tsvfile):
        raise FileNotFoundError("[!] Data %s not found" % tsvfile)
    if not os.path.isfile(semfile):
        raise FileNotFoundError("[!] Data %s not found" % semfile)
    # Collect sentences in tsv file
    sents, labels, pred_labels = collect_data_from_tsv(tsvfile)
    sent_idx = 0
    # Sentence ids present in the XML but absent/conflicting in the TSV;
    # these are removed from the output tree entirely. A set gives O(1)
    # membership tests inside the loop.
    conflict_sent = {
        '912:1', '799:1', '1027:1', '2:1', '231:1', '463:26', '1008:1',
        '762:1', '786:1301', '786:1054', '416:1', '347:1', '11:1', '29:1',
        '32894246#870052#0', '33070309#423221#1', '32464601#418474#1',
        '11350539#680470#4', '11302357#835238#2', '35177381#521555#3',
        '11351451#805713#4', '11513049#499488#8', '33072753#1351349#2',
        '32936760#1397861#8', '32896473#439063#0', '32464601#418474#0',
        '11432442#650772#2', '11313290#1139539#1'
    }
    # Collect sentences in semfile. findall() returns a list, so removing
    # elements from `root` while iterating is safe here.
    tree = ET.parse(semfile)
    root = tree.getroot()
    for sentence_tag in root.findall('sentence'):
        if sentence_tag.get("id") in conflict_sent:
            root.remove(sentence_tag)
            continue
        # TSV rows are aligned with the surviving XML sentences in order;
        # sent_idx advances only for sentences that are kept.
        words, preds = sents[sent_idx], pred_labels[sent_idx]
        aspectterms = get_aspecterm(words, preds)
        aspectterms_tag = ET.Element("aspectTerms")
        for aspectterm in aspectterms:
            # Recover the original surface string and its character span
            # within the sentence text.
            aspectterm_str_origin, from_idx, to_idx = self.gen_aspecterm_positions(
                aspectterm, sentence_tag.find('text').text)
            aspectterm_tag = ET.Element("aspectTerm")
            aspectterm_tag.set("term", aspectterm_str_origin)
            aspectterm_tag.set("from", str(from_idx))
            aspectterm_tag.set("to", str(to_idx))
            aspectterms_tag.append(aspectterm_tag)
        sentence_tag.append(aspectterms_tag)
        sent_idx += 1
    fo_path = os.path.join(os.getcwd(), "data/evaluation/ate_pred.xml")
    tree.write(fo_path, encoding="utf-8")
    return fo_path
def evaluate(self, pred_file, gold_file=""):
    """Compute sentence-level sentiment accuracy from a prediction TSV.

    Gold and predicted sentiment codes are gathered per sentence via
    ``self.evaluate_sentence`` and compared element-wise; the accuracy is
    returned as a percentage string with two decimals. ``gold_file`` is
    accepted for interface compatibility but unused.
    """
    all_gold = []
    all_pred = []
    sents, labels, pred_labels = collect_data_from_tsv(pred_file)
    for words, gold_tags, pred_tags in zip(sents, labels, pred_labels):
        gold_part, pred_part = self.evaluate_sentence(words, gold_tags, pred_tags)
        all_gold.extend(gold_part)
        all_pred.extend(pred_part)
    gold_arr = np.array(all_gold, dtype=np.int32)
    pred_arr = np.array(all_pred, dtype=np.int32)
    accuracy = float(sum(gold_arr == pred_arr)) / len(gold_arr)
    return {"accuracy": "{0:.2f}".format(accuracy * 100)}
def convert(self, filename, **kwargs):
    """Rewrite a 3-column TSV with both label columns mapped through
    ``self.convert_label``.

    Args:
        filename: input TSV path (word / gold label / predicted label rows,
            sentences separated by blank lines).

    Keyword Args:
        fo_path: optional output path; defaults to ``filename`` with its
            ``.tsv`` suffix replaced by ``.ATE.tsv``.

    Returns:
        The path of the file written.

    Fixes vs. original: the output handle is now managed by ``with`` so it
    is closed even if a write raises (it previously leaked on error), and
    the dead ``if kwargs is not None`` guard (always true for ``**kwargs``)
    is replaced by a direct ``dict.get``.
    """
    sents, labels, pred_labels = collect_data_from_tsv(filename)
    fo_path = kwargs.get("fo_path", filename[:-4] + ".ATE.tsv")
    with open(fo_path, mode="w") as fo:
        for words, tags, preds in zip(sents, labels, pred_labels):
            for word, tag, pred in zip(words, tags, preds):
                fo.write("{0}\t{1}\t{2}\n".format(word, self.convert_label(tag),
                                                  self.convert_label(pred)))
            fo.write("\n")  # blank line separates sentences
    return fo_path
def evaluate(self, pred_file, gold_file=""):
    """Joint ATE+APC evaluation on a prediction TSV.

    An aspect term counts toward sentiment accuracy only when its BIO span
    was extracted exactly (per ``self.compare_2_lists``); mismatched spans
    are tallied separately. Returns sentiment accuracy over the correctly
    extracted terms plus extraction counts and the extraction-correct rate,
    all formatted as the downstream report expects. ``gold_file`` is
    accepted for interface compatibility but unused.
    """
    gold_sentiments = []
    pred_sentiments = []
    n_wrong_ate = 0
    n_right_ate = 0
    sents, labels, pred_labels = collect_data_from_tsv(pred_file)
    for words, gold_tags, pred_tags in zip(sents, labels, pred_labels):
        terms, gold_spans, pred_spans = self.get_pred_infor(words, gold_tags, pred_tags)
        for _term, gold_span, pred_span in zip(terms, gold_spans, pred_spans):
            # `is False` kept deliberately: only an explicit False from
            # compare_2_lists counts as an extraction miss.
            if self.compare_2_lists(gold_span, pred_span) is False:
                n_wrong_ate += 1
                continue
            n_right_ate += 1
            # Strip the BIO prefix ("B-POS" -> "POS") before collapsing the
            # span to a single sentiment code.
            gold_sentiments.append(self.BIO2Sentiment(
                [tag.split("-")[-1] for tag in gold_span]))
            pred_sentiments.append(self.BIO2Sentiment(
                [tag.split("-")[-1] for tag in pred_span]))
    gold_arr = np.array(gold_sentiments, dtype=np.int32)
    pred_arr = np.array(pred_sentiments, dtype=np.int32)
    accuracy = float(sum(gold_arr == pred_arr)) / len(gold_arr)
    rate = (float(n_right_ate) / (n_right_ate + n_wrong_ate)) * 100
    return {
        "accuracy": "{0:.2f}".format(accuracy * 100),
        "no_incorrect_ate": n_wrong_ate,
        "no_correct_ate": n_right_ate,
        "ate_correct_rate": "{0:.2f}".format(rate)
    }
def test_anago(keras_model_name="WCP", hand_features=None, task_name="ATEPC2", data_name="laptops"):
    """10-fold cross-validated evaluation of saved anago models on an ATE/APC task.

    For each fold: rebuilds the preprocessor from the fold's training split,
    loads that fold's saved weights (folds without a weights file are
    skipped), evaluates on the fixed test set, re-tags the test set into a
    prediction TSV, and scores it with ATEPCEvaluator. Per-fold results and
    their mean are printed.

    NOTE(review): this block was reconstructed from a source with collapsed
    indentation; statement nesting around the inner `if` blocks should be
    confirmed against the original file.
    """
    DATA_ROOT = 'data'
    SAVE_ROOT = './models'  # trained models
    LOG_ROOT = './logs'  # checkpoint, tensorboard
    # Hard-coded embedding locations — machine-specific paths; only the POS
    # embedding path is actually consulted below (swapped for UNIPOS).
    w_embedding_path = '/home/s1610434/Documents/Data/Vector/glove.twitter.27B.100d.txt'
    c_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.char.100.txt'
    pos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.pos.100.txt'
    unipos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.unipos.100.txt'
    model_config = prepare_modelconfig(keras_model_name)
    training_config = TrainingConfig()
    training_config.max_epoch = 100
    training_config.early_stopping = 20
    print("-----{0}-----{1}-----{2}-----{3}-----".format(
        task_name, data_name, keras_model_name, hand_features))
    save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name)
    train_path = os.path.join(DATA_ROOT,
                              '{0}.{1}.train.tsv'.format(data_name, task_name))
    test_path = os.path.join(DATA_ROOT,
                             '{0}.{1}.test.tsv'.format(data_name, task_name))
    train_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.train.dep.tsv'.format(data_name, task_name))
    test_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.test.dep.tsv'.format(data_name, task_name))
    # train set
    x_train_valid, y_train_valid, _ = collect_data_from_tsv(train_path)
    x_train_valid_dep = collect_dept_data_from_tsv(train_dep_path)
    # test set
    X_test, Y_test, _ = collect_data_from_tsv(test_path)
    X_test_dep = collect_dept_data_from_tsv(test_dep_path)
    # TODO Kfold split
    kf = KFold(n_splits=10)
    i_fold = 0
    results = []
    atepc_evaluator = ATEPCEvaluator()
    for train_index, valid_index in kf.split(x_train_valid):
        # Weights filename encodes model family, hand-feature set and fold.
        model_name = "{0}.{1}.{2}".format(keras_model_name,
                                          "{0}".format(hand_features), i_fold)
        X_train, X_valid = x_train_valid[train_index], x_train_valid[valid_index]
        X_train_dep, X_valid_dep = x_train_valid_dep[train_index], x_train_valid_dep[valid_index]
        Y_train, Y_valid = y_train_valid[train_index], y_train_valid[valid_index]
        print("Data train: ", X_train.shape, Y_train.shape)
        print("Data valid: ", X_valid.shape, Y_valid.shape)
        print("Data test: ", X_test.shape, Y_test.shape)
        # Preprocessor must be rebuilt per fold so vocab sizes match the
        # weights that were trained on this fold's split.
        p = prepare_preprocessor(list(zip(X_train, X_train_dep)), Y_train,
                                 keras_model_name=keras_model_name,
                                 hand_features=hand_features)
        model_config.vocab_size = len(p.vocab_word)
        model_config.char_vocab_size = len(p.vocab_char)
        # "P" in the model name selects the POS-embedding variant.
        if keras_model_name.find("P") != -1:
            if hand_features is not None:
                if "UNIPOS" in hand_features:
                    pos_embedding_path = unipos_embedding_path
            model_config.pos_vocab_size = len(p.pos_extractor.features_dict)
        # "H" in the model name enables hand-crafted features.
        if keras_model_name.find("H") != -1:
            # model_config.hand_feature_size = gen_no_hand_dimension(data_name, hand_features, keras_model_name)
            model_config.hand_feature_size = 53
            print("model_config.hand_feature_size: ",
                  str(model_config.hand_feature_size))
        filepath = os.path.join(save_path, model_name)
        # Skip folds whose trained weights are absent.
        if os.path.isfile(filepath) is False:
            continue
        evaluator = anago.Evaluator(model_config, weights=model_name,
                                    save_path=save_path, preprocessor=p,
                                    keras_model_name=keras_model_name)
        print("--- Test phrase --- " + model_name)
        # print("Train ")
        # f1_score_train = evaluator.eval(list(zip(X_train, X_train_dep)), Y_train)
        # print("Validation ")
        # f1_score_valid = evaluator.eval(list(zip(X_valid, X_valid_dep)), Y_valid)
        # print("Test ")
        f1_score_test = evaluator.eval(list(zip(X_test, X_test_dep)), Y_test)
        print("---")
        i_fold += 1
        # Kfold cross validation
        f_out_name = "data/{0}.{1}.test.pred.tsv".format(data_name, task_name)
        f_out = open(f_out_name, "w")
        ## Tagging
        tagger = anago.Tagger(model_config, model_name, save_path=save_path,
                              preprocessor=p, keras_model_name=keras_model_name)
        # Write word / gold / predicted triples, blank line between sentences.
        for x, y in zip(list(zip(X_test, X_test_dep)), Y_test):
            result = tagger.predict(x)
            for word, label, pred in zip(x[0], y, result):
                f_out.write("{0}\t{1}\t{2}\n".format(word, label, pred))
            f_out.write("\n")
        f_out.close()
        ate_f1, apc_acc, c_apc_acc = atepc_evaluator.evaluate(f_out_name)
        results.append([ate_f1, apc_acc, c_apc_acc])
        print(results[-1])
    print("-----All-----{0}--{1}".format(keras_model_name, data_name))
    for result in results:
        print(result)
    print("-----AVG-----")
    results_np = np.array(results, dtype=np.float32)
    print(results_np.mean(axis=0))
    print("-------------")
def evaluate(self, pred_file, gold_file=""):
    """Read gold/predicted tag sequences from ``pred_file`` and delegate
    scoring to ``self._evaluate``. ``gold_file`` is accepted for interface
    compatibility but unused.
    """
    _, gold_tags, predicted_tags = collect_data_from_tsv(pred_file)
    return self._evaluate(gold_tags, predicted_tags)
} print("-----{0}-----{1}-----{2}-----{3}-----".format( task_name, data_name, keras_model_name, hand_features)) save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name) train_path = os.path.join(DATA_ROOT, '{0}.{1}.train.tsv'.format(data_name, task_name)) test_path = os.path.join(DATA_ROOT, '{0}.{1}.test.tsv'.format(data_name, task_name)) train_dep_path = os.path.join( DATA_ROOT, '{0}.{1}.train.dep.tsv'.format(data_name, task_name)) test_dep_path = os.path.join( DATA_ROOT, '{0}.{1}.test.dep.tsv'.format(data_name, task_name)) # train set x_train_valid, y_train_valid, _ = collect_data_from_tsv(train_path) x_train_valid_dep = collect_dept_data_from_tsv(train_dep_path) # test set X_test, Y_test, _ = collect_data_from_tsv(test_path) X_test_dep = collect_dept_data_from_tsv(test_dep_path) # TODO Kfold split kf = KFold(n_splits=10) i_fold = 0 pos_fe = POSExtractor() print(len(pos_fe.features_dict)) count_train = 0 count_valid = 0