Example #1
def generate_for_file(path):
    for sen in CoNLL2005Reader(path).read_all():
        # skip sentences longer than max_len when a length limit is set
        to_process = True
        if len(sen) > self.max_len > 0:
            to_process = False
        if to_process:
            # only sentences that actually contain predicates are useful for SRL
            if len(sen.get_predicates()) > 0:
                for w in sen:
                    wf = w.form.lower() if self.do_lowercase else w.form
                    # words unseen during training go into the extra vocabulary
                    if wf not in self.words:
                        self.add_to_vob(wf, self.extra_words)
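None of these excerpts includes `add_to_vob` itself. A minimal sketch of what such a helper presumably does, assuming `vob` maps tokens to integer ids and the optional `counter` tracks frequencies (the body below is an assumption, not the library's actual code):

    def add_to_vob(self, token, vob, counter=None):
        # assign the next free integer id to unseen tokens (assumed behaviour)
        if token not in vob:
            vob[token] = len(vob)
        # optionally track frequencies, e.g. for smoothing rare words
        if counter is not None:
            counter[token] = counter.get(token, 0) + 1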
Example #2
    def get_data_eval_not_in_vob(self, eval_path, data=None):
        # renamed from `eval`, which shadows the Python builtin
        eval_data = []
        if data is None:
            data = CoNLL2005Reader(eval_path).read_all()
        for sen in data:
            dt = self.process_sentence_not_in_vob(sen, is_train=False)
            if len(dt) > 0:
                eval_data.extend(dt)
        return eval_data
Example #3
        def generate_for_file(path, is_train=True):
            for sen in CoNLL2005Reader(path).read_all():
                # skip sentences longer than max_len when a length limit is set
                to_process = True
                if 0 < self.max_len < len(sen):
                    to_process = False
                if to_process:
                    for w in sen:
                        wf = w.form.lower() if self.do_lowercase else w.form
                        if is_train:
                            # training also tracks word frequencies
                            self.add_to_vob(wf, self.words, self.word_count)
                        else:
                            self.add_to_vob(wf, self.words)
Example #4
    def get_data_train(self, train_path, data=None):
        train = []
        if data is None:
            data = CoNLL2005Reader(train_path).read_all()

        for sen in data:
            # skip over-long sentences when a length limit is set
            if 0 < self.max_len < len(sen):
                continue

            dt = self.process_sentence_not_in_vob(sen)
            if len(dt) > 0:
                train.extend(dt)
        return train
Example #5
        def generate_for_file(path, is_train=True):
            for sen in CoNLL2005Reader(path).read_all():
                # skip sentences longer than max_len when a length limit is set
                to_process = True
                if 0 < self.max_len < len(sen):
                    to_process = False
                if to_process:
                    if len(sen.get_predicates()) > 0:
                        for w in sen:
                            wf = w.form.lower() if self.do_lowercase else w.form
                            w_ner = w.entity_tag
                            # a plain if/else is clearer than a conditional
                            # expression used only for its side effects
                            if is_train:
                                self.add_to_vob(wf, self.words, self.word_count)
                            else:
                                self.add_to_vob(wf, self.words)
                            self.add_to_vob(w_ner, self.labels)

                        # argument labels also go into the label vocabulary
                        for p in sen.get_predicates():
                            for arg in p.arguments:
                                self.add_to_vob(arg, self.labels)
Example #6
    def conll_evaluate(self, test_input, test, test_path):
        # write the gold propositions once so the CoNLL scorer can reuse them
        if not os.path.exists(test_path + "_props.txt"):
            gtest = CoNLL2005Reader(test_path).read_all()
            write_props(gtest, test_path + "_props.txt")

        current_epoch = test_input.epochs

        predictions = []      # raw per-token scores (A* inference path)
        all_predictions = []  # decoded label sequences
        # consume batches until the iterator rolls over into the next epoch
        while test_input.epochs == current_epoch:
            (inputs_vals, predicate_vals,
             srl_target_vals), seq_len_vals = test_input.next_batch(
                 self.cfg.batch_dev_size)

            char_vals, char_lens = None, None
            if "char" in self.cfg.emb_type:
                char_vals, char_lens = self.charpp.get_data(inputs_vals)

            feeddct_err = self.get_feed(inputs_vals=inputs_vals,
                                        predicate_vals=predicate_vals,
                                        srl_target_vals=srl_target_vals,
                                        ner_target_vals=None,
                                        seq_len_vals=seq_len_vals,
                                        lr=self.cfg.learning_rate,
                                        char_vals=char_vals,
                                        char_lens=char_lens,
                                        is_train=False)

            if self.cfg.top_type == "crf":
                # CRF top layer: Viterbi-decode every sequence with the
                # learned transition matrix
                scoresp, trans_params = self.sess.run(
                    [self.logits, self.trans_params], feed_dict=feeddct_err)
                tmp = []
                for score, sequence_length in zip(scoresp.tolist(),
                                                  seq_len_vals):
                    score = np.asarray(
                        score[:sequence_length])  # keep only the valid steps
                    viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                        score, trans_params)
                    tmp.append(viterbi_seq)
                all_predictions.extend(tmp)
            else:
                if self.cfg.inference_type == "A*":
                    # A* inference runs after the loop; only collect scores here
                    [score] = self.sess.run([self.logits],
                                            feed_dict=feeddct_err)
                    predictions.extend(score.tolist())
                elif self.cfg.inference_type == "argmax":
                    # greedy decoding: map predicted indices straight to labels
                    score, predicted_labels = self.sess.run(
                        [self.logits, self.prediction], feed_dict=feeddct_err)
                    predicted_labels = predicted_labels.tolist()
                    x = []
                    for lbls in predicted_labels:
                        x.append([self.pp.labels[idx] for idx in lbls])
                    all_predictions.extend(x)

        # deferred A* inference over the collected softmax scores
        if self.cfg.top_type == "softmax" and self.cfg.inference_type == "A*":
            for i, scores in enumerate(predictions):
                predictions_infe = self.inference(
                    scores, self.get_transition_params(self.pp.labels))
                all_predictions.append(
                    [self.pp.labels[idx] for idx in predictions_infe[0]])

        # clear the gold arguments, then attach one predicted argument
        # list per predicate (predictions are ordered per predicate)
        pos = 0
        for sen in test:
            for pred in sen.get_predicates():
                pred.arguments = []
        for sen in test:
            for pred in sen.get_predicates():
                temps = all_predictions[pos][0:len(sen)]
                if self.cfg.top_type == "crf":
                    # CRF predictions are still label indices at this point
                    temps = [self.pp.labels[idx] for idx in temps]
                pred.arguments = self.post_processing(temps)
                pos += 1

        write_props(test, test_path + "_out.txt")
        write_short_conll2005_format(test, test_path + "_full_output.txt")
        finalscores = self.get_CoNLL2001Score(test_path + "_props.txt",
                                              test_path + "_out.txt")
        return {
            "conll": finalscores[2],
            "P": finalscores[0],
            "R": finalscores[1]
        }
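For reference, `tf.contrib.crf.viterbi_decode` (TensorFlow 1.x) runs outside the graph on plain NumPy arrays, which is why the CRF branch above can decode sentence by sentence inside the batch loop. A self-contained illustration with made-up shapes:

    import numpy as np
    import tensorflow as tf  # TF 1.x, where tf.contrib.crf is available

    score = np.random.rand(6, 4)  # [seq_len, num_tags] unary scores, one sentence
    trans = np.random.rand(4, 4)  # [num_tags, num_tags] transition scores
    viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(score, trans)
    print(viterbi_seq)            # best-scoring tag index for every step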
Example #7
    def init(self):
        def get_data_list():
            # training builds vocabularies from both the train and dev sets
            if self.cfg.mode == "train":
                lst = []
                if self.cfg.train_path is not None:
                    lst.append(self.cfg.train_path)
                if self.cfg.dev_path is not None:
                    lst.append(self.cfg.dev_path)
                return lst
            if self.cfg.mode == "infer":
                if self.cfg.test_path is not None:
                    return self.cfg.test_path

        if not os.path.exists(self.cfg.model_dir):
            if self.cfg.mode == "infer":
                # NOTE: self.logger is only configured at the end of init(),
                # so this call assumes a logger was already attached elsewhere
                self.logger.error("Model dir does not exist!")
                sys.exit(1)  # exit with a non-zero code on a fatal error
            else:
                os.makedirs(self.cfg.model_dir)

        self.pp = DataProcessor(model_dir=self.cfg.model_dir)
        self.pp.max_len = self.cfg.max_train_length

        if self.cfg.mode == "train":
            if self.pp.is_empty():
                # first run: build the vocabularies from scratch and persist them
                self.pp.generate_vob(get_data_list())
                if self.cfg.smooth_rare_words > 0:
                    self.pp.smooth_word_vob(self.cfg.smooth_rare_words)
                self.pp.export_vobs(self.cfg.model_dir)
            else:
                # vocabularies already exist; optionally extend them
                if self.cfg.extend_vob:
                    self.pp.extend_vob(get_data_list())
                    self.pp.export_vobs(self.cfg.model_dir)

            self.train_data = self.pp.get_data_train(self.cfg.train_path,
                                                     data=self.train_data)

            self.dev_sentences = CoNLL2005Reader(self.cfg.dev_path).read_all()
            self.dev_data = self.pp.get_data_eval(self.cfg.dev_path,
                                                  self.dev_sentences)

        if self.cfg.mode == "infer" and self.cfg.extend_vob:
            self.pp.extend_vob(get_data_list())
            self.pp.export_vobs(self.cfg.model_dir)

        if "char" in self.cfg.emb_type:
            self.charpp = CharProcessor(self.pp.words)

        self.my_summary.import_data(self.cfg.model_dir)
        # start logging
        self.logger = logging.getLogger('logger')
        self.logger.setLevel(logging.DEBUG)
        logging.basicConfig(format='%(message)s', level=logging.DEBUG)
        handler = logging.FileHandler(self.cfg.log_file)
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logging.getLogger().addHandler(handler)
Example #8
    def evaluate(self, path):
        test = CoNLL2005Reader(path).read_all()
        test_data = self.pp.get_data_eval(path, test)
        test_input = PaddedDataIteratorSEQ(test_data)
        print(self.conll_evaluate(test_input, test, path))
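Assuming a trained model instance that also provides conll_evaluate from Example #6, usage would presumably look like this (the instance name and path below are hypothetical):

    model.evaluate("conll2005/test.wsj.conll")  # hypothetical path; prints
                                                # {"conll": ..., "P": ..., "R": ...}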
Example #9
from liir.dame.srl.DataProcessor import DataProcessor
from liir.dame.core.nn.Data import BucketedDataIteratorSEQ
from liir.dame.core.io.CoNLL2005Reader import CoNLL2005Reader
import sys
if __name__ == "__main__":
    train_data_path = sys.argv[1]
    pp = DataProcessor(model_dir="test-bucket-data")
    pp.generate_vob(train_data_path)
    train_data_sens = CoNLL2005Reader(train_data_path).read_all()
    train_data = pp.get_data_train(train_data_path, data=train_data_sens)
    print("---------------Train data-------------")
    print(train_data)
    print("---------------Processed with Bucket-data-------------")
    train_input = BucketedDataIteratorSEQ(dt=train_data, num_buckets=2)
    print(train_input)
    print("---------------Processed with Bucket-data and next batch-------------")
    (inputs_vals, predicate_vals, srl_target_vals, ner_target_vals), seq_len_vals = train_input.next_batch(2)
    print(('input vals', inputs_vals))
    print(('predicate vals', predicate_vals))
    print(('srl target vals', srl_target_vals))
    print(('ner target vals', ner_target_vals))
    print(('seq len vals', seq_len_vals))
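BucketedDataIteratorSEQ itself is not shown here; conceptually, a bucketed iterator groups sequences of similar length so each batch needs less padding. A minimal sketch of the idea, independent of the actual class:

    def bucket_by_length(samples, num_buckets):
        # sort by sequence length, then cut into contiguous buckets;
        # batches are later drawn from within a single bucket
        samples = sorted(samples, key=len)
        size = (len(samples) + num_buckets - 1) // num_buckets
        return [samples[i:i + size] for i in range(0, len(samples), size)]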
Example #10
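        # (truncated excerpt: the tail of a color-name -> hex map,
        #  presumably used by HTMLWriter below)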
        'royalblue': '#4169E1',
        'saddlebrown': '#8B4513',
        'salmon': '#FA8072',
        'sandybrown': '#FAA460',
        'seagreen': '#2E8B57',
        'sienna': '#A0522D',
        'silver': '#C0C0C0',
        'skyblue': '#87CEEB',
        'slateblue': '#6A5ACD',
        'slategray': '#708090',
        'springgreen': '#00FF7F',
        'steelblue': '#4682B4',
        'tan': '#D2B48C',
        'teal': '#008080',
        'thistle': '#D8BFD8',
        'tomato': '#FF6347',
        'turquoise': '#40E0D0',
        'violet': '#EE82EE',
        'wheat': '#F5DEB3',
        'yellowgreen': '#9ACD32'
    }


if __name__ == "__main__":
    import sys

    writer = HTMLWriter()
    reader = CoNLL2005Reader(sys.argv[1])

    writer.write_props(reader.read_all(), sys.argv[2])