예제 #1
0
파일: main.py 프로젝트: yxd126/NER_clm_zoe
 def evaluate_dataset(self,
                      file_name,
                      mode,
                      do_inference=True,
                      use_prior=True,
                      use_context=True,
                      size=-1):
     """Run the pipeline over every sentence of a dataset file, printing
     running evaluation results as each sentence is processed.

     @file_name: path to the dataset file (common json format)
     @mode: inference mode name; also selects the InferenceProcessor setup
     @size: optional cap on the number of sentences read (-1 = all)
     @return: None
     """
     # Guard clause: refuse to run on a missing input file.
     if not os.path.isfile(file_name):
         print("[ERROR] Invalid input data file.")
         return
     self.inference_processor = InferenceProcessor(mode, do_inference,
                                                   use_prior, use_context)
     reader = DataReader(file_name, size)
     for current_sentence in reader.sentences:
         result = self.process_sentence(current_sentence)
         if result == -1:
             # Sentence was rejected by the pipeline; skip it.
             continue
         self.evaluated.append(result)
         result.print_self()
         evaluator = Evaluator()
         evaluator.print_performance(self.evaluated)
예제 #2
0
    def handle_input(self):
        """Flask handler: type each requested mention of a tokenized sentence.

        Expects a JSON body with "tokens", "mention_starts", "mention_ends"
        and "index"; "mode" selects the taxonomy ("figer", "custom" with a
        "taxonomy" field, or any other named mode).

        @return: a JSON string with predicted types, candidate titles,
                 mention surfaces, selected candidates and the echoed index.
        """
        start_time = time.time()
        ret = {}
        r = request.get_json()
        # Reject requests missing any required field.
        if "tokens" not in r or "mention_starts" not in r or "mention_ends" not in r or "index" not in r:
            ret["type"] = [["INVALID_INPUT"]]
            ret["index"] = -1
            ret["mentions"] = []
            ret["candidates"] = [[]]
            return json.dumps(ret)
        # One Sentence per requested mention span, all over the same tokens.
        sentences = []
        for i in range(len(r["mention_starts"])):
            sentence = Sentence(r["tokens"], int(r["mention_starts"][i]),
                                int(r["mention_ends"][i]), "")
            sentences.append(sentence)
        # NOTE(review): "mode" is read unconditionally although the validation
        # above does not require it; an absent key raises KeyError here.
        mode = r["mode"]
        predicted_types = []
        predicted_candidates = []
        other_possible_types = []
        selected_candidates = []
        mentions = []
        # Select the inference processor matching the requested mode.
        if mode == "figer":
            # Default mode: reuse the runner's already-loaded processor.
            selected_inference_processor = self.runner.inference_processor
        elif mode == "custom":
            # Caller supplies its own taxonomy mapping rules.
            rules = r["taxonomy"]
            mappings = self.parse_custom_rules(rules)
            selected_inference_processor = InferenceProcessor(
                mode, custom_mapping=mappings)
        else:
            # Named mode: build a processor but share loaded resources.
            selected_inference_processor = InferenceProcessor(
                mode, resource_loader=self.runner.inference_processor)

        for sentence in sentences:
            sentence.set_signature(selected_inference_processor.signature())
            cached = self.mem_cache.query_cache(sentence)
            if cached is not None:
                sentence = cached
            else:
                self.runner.process_sentence(sentence,
                                             selected_inference_processor)
                try:
                    self.mem_cache.insert_cache(sentence)
                    self.surface_cache.insert_cache(sentence)
                except Exception:
                    # Caching is best-effort; never fail the request over it.
                    # (Was a bare `except:`, which also swallowed SystemExit
                    # and KeyboardInterrupt.)
                    print("Cache insertion exception. Ignored.")
            predicted_types.append(list(sentence.predicted_types))
            predicted_candidates.append(sentence.elmo_candidate_titles)
            mentions.append(sentence.get_mention_surface_raw())
            selected_candidates.append(sentence.selected_title)
            other_possible_types.append(sentence.could_also_be_types)

        elapsed_time = time.time() - start_time
        print("Processed mention " +
              str([x.get_mention_surface() for x in sentences]) + " in mode " +
              mode + ". TIME: " + str(elapsed_time) + " seconds.")
        ret["type"] = predicted_types
        ret["candidates"] = predicted_candidates
        ret["mentions"] = mentions
        ret["index"] = r["index"]
        ret["selected_candidates"] = selected_candidates
        ret["other_possible_type"] = other_possible_types
        return json.dumps(ret)
예제 #3
0
        file.write(sen.tokens[i] + '\t' + p_labels[i] + '\t' + gold_labels[i] +
                   '\n')
    file.write('\n')


# Post-processing script: re-cases mention surfaces from a prediction run so
# they match keys of the pickled title->Freebase map.
# file_name = 'CoNLL_dev'
file_name = 'On'

# Pickled mapping from Wikipedia-style titles to Freebase entries.
# NOTE(review): trusted local file assumed — pickle must not load untrusted data.
freebase_file = open('data/title2freebase.pickle', 'rb')

freebase = pickle.load(freebase_file)
# Predictions produced by an earlier run; this script rewrites them.
prediction_data = ReadData('result_' + file_name + '.out')

# Output file for the corrected predictions (overwritten if it exists).
outfile = open('fixed_result_' + file_name + '.out', 'w')

inference_processor = InferenceProcessor("ontonotes")

# Minimum prior probability for accepting an override (used further below).
prior_threshold = 0.5

for sen in prediction_data:
    for idx, prediction in enumerate(sen.predictions):
        # Join the mention tokens with '_' to match Freebase title format.
        surface = '_'.join(sen.tokens[prediction[0]:prediction[1]])
        # Try alternate casings until one hits the map:
        # first-char-preserved + lowercased tail, then ALL CAPS, then lowercase.
        if surface not in freebase:
            if surface[0] + surface[1:].lower() in freebase:
                surface = surface[0] + surface[1:].lower()
            else:
                if surface.upper() in freebase:
                    surface = surface.upper()
                else:
                    if surface.lower() in freebase:
                        surface = surface.lower()
        # NOTE(review): the loop body appears truncated in this excerpt —
        # `surface` is recomputed but not yet used; confirm against the full
        # script before modifying.
예제 #4
0
파일: main.py 프로젝트: yxd126/NER_clm_zoe
 def __init__(self):
     """Build the ontonotes-mode pipeline and an empty results buffer."""
     self.evaluated = []
     self.evaluator = Evaluator()
     self.inference_processor = InferenceProcessor("ontonotes")
     self.esa_processor = EsaProcessor()
     self.bert_processor = BertProcessor()
예제 #5
0
파일: main.py 프로젝트: yxd126/NER_clm_zoe
class ZoeRunner:
    """End-to-end runner: candidate generation, ranking and type inference.

    @allow_tensorflow sets whether the system will do run-time ELMo processing.
                      It's set to False in experiments as ELMo results are cached,
                      but please set it to default True when running on new sentences.
                      NOTE(review): this __init__ variant takes no such parameter;
                      the note is retained from a sibling variant — confirm.
    """

    def __init__(self):
        self.bert_processor = BertProcessor()
        self.esa_processor = EsaProcessor()
        self.inference_processor = InferenceProcessor("ontonotes")
        self.evaluator = Evaluator()
        # Sentences processed so far; consumed by evaluation and save().
        self.evaluated = []

    def process_sentence(self, sentence, inference_processor=None):
        """Process a single sentence.

        @sentence: a sentence in zoe_utils.Sentence structure
        @inference_processor: optional processor overriding the default one
        @return: the same sentence with predicted types set
        """
        esa_candidates = self.esa_processor.get_candidates(sentence)
        bert_candidates = self.bert_processor.rank_candidates(
            sentence, esa_candidates)
        if inference_processor is None:
            inference_processor = self.inference_processor
        print("Ranking finished")
        inference_processor.inference(sentence, bert_candidates,
                                      esa_candidates)
        return sentence

    def evaluate_dataset(self,
                         file_name,
                         mode,
                         do_inference=True,
                         use_prior=True,
                         use_context=True,
                         size=-1):
        """Evaluate on a dataset with multiple sentences.

        @file_name: A string indicating the data file.
                    Note the format needs to be the common json format.
        @mode: A string indicating the mode; adjusts inference and caches.
        @size: optional cap on sentences read (-1 = all)
        @return: None
        """
        if not os.path.isfile(file_name):
            print("[ERROR] Invalid input data file.")
            return
        self.inference_processor = InferenceProcessor(mode, do_inference,
                                                      use_prior, use_context)
        dataset = DataReader(file_name, size)
        for sentence in dataset.sentences:
            processed = self.process_sentence(sentence)
            # Defensive skip; process_sentence as written never returns -1.
            if processed == -1:
                continue
            self.evaluated.append(processed)
            processed.print_self()
            # A fresh Evaluator reports running performance each iteration.
            evaluator = Evaluator()
            evaluator.print_performance(self.evaluated)

    def save(self, file_name):
        """Save the predicted sentences list to a file.

        @file_name: target file path; existing content is overwritten.
        @return: None
        """
        with open(file_name, "wb") as handle:
            pickle.dump(self.evaluated, handle, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def evaluate_saved_runlog(log_name):
        """Load a pickled run log and print its evaluation metrics."""
        with open(log_name, "rb") as handle:
            sentences = pickle.load(handle)
        evaluator = Evaluator()
        evaluator.print_performance(sentences)

    def find_best_prediction_seq(self, i, choice):
        """Walk the dp back-pointers to recover the chosen mention sequence.

        @i: index of the last token considered (-1 terminates the recursion)
        @choice: dict mapping token index -> last mention chosen at that index
        @return: list of chosen mentions in left-to-right order
        """
        if i == -1:
            return []
        if i not in choice:
            # No mention ends in the optimum at i; look one token earlier.
            return self.find_best_prediction_seq(i - 1, choice)
        # A mention was chosen; continue from just before its start.
        result = self.find_best_prediction_seq(choice[i][0] - 1, choice)
        result.append(choice[i])
        return result

    def dp(self, sen, cands, scores):
        """Pick a non-overlapping set of candidate mentions maximizing score.

        @sen: sentence-like object with a .tokens list
        @cands: candidate (start, end) token spans, sorted by end ascending
        @scores: dict mapping each candidate span to its score
        @return: chosen (start, end) spans in left-to-right order
        """
        # f[i]: best total score achievable over tokens[0..i].
        f = [0] * len(sen.tokens)
        # choice[i]: last mention chosen in the optimum ending at i.
        choice = {}
        for i in range(len(sen.tokens)):
            for mention in cands:
                if mention[1] - 1 > i:
                    # cands are sorted by end, so later ones cannot fit either.
                    break
                # Best score attainable strictly before this mention starts.
                # BUG FIX: the original used f[mention[0] - 1] unguarded, so a
                # mention starting at token 0 read f[-1] — Python wraps that to
                # the LAST cell, double-counting scores. Use 0 for start == 0.
                prev = f[mention[0] - 1] if mention[0] > 0 else 0
                if prev + scores[mention] > f[i]:
                    f[i] = prev + scores[mention]
                    choice[i] = mention
        final_prediction = self.find_best_prediction_seq(
            len(sen.tokens) - 1, choice)
        return final_prediction

    def print_prediction(self, outfile, sen, prediction, types):
        """Write token / predicted-label / gold-label triples in BIO format.

        @outfile: writable object receiving tab-separated lines
        @sen: sentence-like object with .tokens and .gold (start, end, type)
        @prediction: chosen mention spans
        @types: dict mapping each mention span to its type (or 'O')
        """
        p_labels = ['O'] * len(sen.tokens)
        for mention in prediction:
            if types[mention] == 'O':
                p_labels[mention[0]] = 'O'
            else:
                p_labels[mention[0]] = 'B-' + types[mention]
            for i in range(mention[0] + 1, mention[1]):
                if types[mention] == 'O':
                    p_labels[i] = 'O'
                else:
                    p_labels[i] = 'I-' + types[mention]

        gold_labels = ['O'] * len(sen.tokens)
        for mention in sen.gold:
            gold_labels[mention[0]] = 'B-' + mention[2]
            for i in range(mention[0] + 1, mention[1]):
                gold_labels[i] = 'I-' + mention[2]

        for i in range(0, len(sen.tokens)):
            outfile.write(sen.tokens[i] + '\t' + p_labels[i] + '\t' +
                          gold_labels[i] + '\n')
        outfile.write('\n')

    def post_processing(self, sen):
        """Override a prediction with the prior type when it is confident.

        @sen: sentence-like object with .tokens, .mention_start, .mention_end
        @return: the same sentence, possibly with its prediction overridden
        """
        surface = ' '.join(
            sen.tokens[sen.mention_start:sen.mention_end]).lower()

        if surface in self.inference_processor.prior_prob_map:
            prior_prob = self.inference_processor.prior_prob_map[surface]
            # Only trust the prior above a 0.7 probability threshold.
            if float(prior_prob[1]) > 0.7:
                # Renamed from `type`, which shadowed the builtin.
                coarse_types = self.inference_processor.get_coarse_types_of_title(
                    prior_prob[0])
                if len(coarse_types) == 0:
                    return sen
                coarse_type = list(coarse_types)[0]
                if coarse_type == '':
                    return sen
                print(surface, prior_prob[0], coarse_type)
                sen.set_predictions(coarse_type, 1)
        return sen
예제 #6
0
 def __init__(self, allow_tensorflow=True):
     """Build the figer-mode pipeline and an empty results buffer.

     @allow_tensorflow: when False, run-time ELMo processing is disabled
                        (used in experiments where ELMo results are cached).
     """
     self.evaluated = []
     self.evaluator = Evaluator()
     self.inference_processor = InferenceProcessor("figer")
     self.esa_processor = EsaProcessor()
     self.elmo_processor = ElmoProcessor(allow_tensorflow)