Example #1
    def loadFile(self, lines):
        # Requires `defaultdict` from collections plus the project's
        # `Extraction` class and `all_index` helper.
        sent = ''
        d = {}
        indsForQuestions = defaultdict(set)

        for line in lines.split('\n'):
            line = line.strip()
            if not line:
                continue
            data = line.split('\t')
            if len(data) == 1:
                # A single-column line starts a new sentence: flush the
                # accumulated question indices onto the previous sentence.
                if sent:
                    for ex in d[sent]:
                        ex.indsForQuestions = dict(indsForQuestions)
                sent = line
                d[sent] = []
                indsForQuestions = defaultdict(set)
            else:
                # Columns: predicate, predicate index, then alternating
                # question / answer pairs.
                pred = data[0]
                pred_index = data[1]  # unused in this variant
                cur = Extraction((pred, all_index(sent, pred, matchCase=False)),
                                 sent,
                                 confidence=1.0,
                                 question_dist=self.question_dist)
                for q, a in zip(data[2::2], data[3::2]):
                    indices = all_index(sent, a, matchCase=False)
                    cur.addArg((a, indices), q)
                    indsForQuestions[q] = indsForQuestions[q].union(indices)

                if sent and cur.noPronounArgs():
                    d[sent].append(cur)

        # Flush the question indices for the final sentence as well.
        if sent:
            for ex in d[sent]:
                ex.indsForQuestions = dict(indsForQuestions)
        return d
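
A minimal usage sketch for Example #1 (the host class name and the input values are hypothetical, inferred from the parsing logic above): the loader expects a sentence on its own line, followed by tab-separated lines of the form predicate, predicate index, then alternating question/answer columns.

    # Hypothetical input in the format Example #1 parses.
    raw = "John gave Mary a book .\n" + "\t".join([
        "gave", "1",
        "who gave something ?", "John",
        "what did someone give ?", "a book",
    ])
    reader = Reader()  # hypothetical host class providing loadFile
    parsed = reader.loadFile(raw)
    for sent, extractions in parsed.items():
        print(sent, "->", len(extractions), "extraction(s)")
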
Example #2
    def loadFile(self, lines):
        # Same driver as Example #1, but the richer input format also
        # carries the predicate's token indices and head index.
        sent = ''
        d = {}
        indsForQuestions = defaultdict(set)

        for line in lines.split('\n'):
            line = line.strip()
            if not line:
                continue
            data = line.split('\t')
            if len(data) == 1:
                # A single-column line starts a new sentence: flush the
                # accumulated question indices onto the previous sentence.
                if sent:
                    for ex in d[sent]:
                        ex.indsForQuestions = dict(indsForQuestions)
                sent = line
                d[sent] = []
                indsForQuestions = defaultdict(set)
            else:
                # Columns: predicate, serialized predicate indices,
                # head predicate index, then question / answer pairs.
                pred = self.preproc(data[0])
                pred_indices = [int(i) for i in eval(data[1])]
                head_pred_index = int(data[2])
                cur = Extraction((pred, [pred_indices]),
                                 head_pred_index,
                                 sent,
                                 confidence=1.0)

                for q, a in zip(data[3::2], data[4::2]):
                    preproc_arg = self.preproc(a)
                    if not preproc_arg:
                        logging.warning("Argument reduced to None: {}".format(a))
                        continue  # skip arguments that vanish in preprocessing
                    indices = fuzzy_match_phrase(preproc_arg.split(" "),
                                                 sent.split(" "))
                    cur.addArg((preproc_arg, indices), q)
                    indsForQuestions[q] = indsForQuestions[q].union(
                        flatten(indices))

                if sent and cur.noPronounArgs():
                    cur.resolveAmbiguity()
                    d[sent].append(cur)

        # Flush the question indices for the final sentence as well.
        if sent:
            for ex in d[sent]:
                ex.indsForQuestions = dict(indsForQuestions)
        return d
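
Example #2 reads a richer format than Example #1: the second column holds a serialized list of predicate token indices (consumed via eval) and the third the head predicate's index; question/answer pairs start at the fourth column. A sketch of one such line, with hypothetical values:

    # Hypothetical input in the format Example #2 parses.
    line = "\t".join([
        "gave", "[1]", "1",  # predicate, serialized indices, head index
        "who gave something ?", "John",
        "what did someone give ?", "a book",
    ])
    raw = "John gave Mary a book .\n" + line
    parsed = reader.loadFile(raw)  # `reader` as in the earlier sketch
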
Example #3
    def read(self):
        # TODO: what to do about generalized questions that are not yet in
        # this distribution set? Use analyze.py.
        # Map each question template to {position: count}; empty when no
        # distribution file was given.
        if self.dist_file:
            question_dist = {q: {int(loc): cnt for loc, cnt in dist.items()}
                             for q, dist
                             in json.load(open(self.dist_file)).items()}
        else:
            question_dist = {}

        # Pull the sentence, the predicate, and the QA pairs with 5/6 or
        # more validations (possibly preprocessing at this step); first load
        # the JSON-lines data into a list.
        data = []
        with codecs.open(self.qa_path, 'r', encoding='utf8') as f:
            for line in f:
                data.append(json.loads(line))

        f_out = open(self.output_file, "w")
        jsonl_out = open('science_eval_sent.jsonl', "w")
        eval_out = open('science_eval.oie', "w")

        # Parse the QA data.
        for item in data:
            sent_id = item["sentenceId"]
            # Keep only TQA (science) sentences.
            if sent_id.split(':')[0] != 'TQA':
                continue
            sentence_tokens = item["sentenceTokens"]
            sentence = ' '.join(sentence_tokens)
            if self.output_eval:
                jsonl_out.write(json.dumps({"sentence": sentence}) + '\n')

            for _, verb_entry in item["verbEntries"].items():
                verb_index = verb_entry["verbIndex"]
                verb_inflected_forms = verb_entry["verbInflectedForms"]
                base_pred = sentence_tokens[verb_index]
                surfacePred = base_pred
                answer_list = []
                questions = []

                for _, question_label in verb_entry["questionLabels"].items():
                    answers = len(question_label["answerJudgments"])
                    valid_answers = len([
                        ans for ans in question_label["answerJudgments"]
                        if ans["isValid"]
                    ])
                    # Skip question sets without enough valid answers.
                    if valid_answers / answers < self.min_correct:
                        continue
                    # Collect all valid answer spans for this question.
                    ans_spans = []
                    for ans in question_label["answerJudgments"]:
                        if ans["isValid"]:
                            for span in ans["spans"]:
                                ans_spans.append(span)
                    # TODO: add a long/short flag here.
                    consolidated_spans = consolidate_answers(
                        ans_spans, self.length)
                    # Look up each consolidated answer span in the sentence
                    # tokens.
                    consolidated_ans = [
                        ' '.join(sentence_tokens[start:end])
                        for (start, end) in consolidated_spans
                    ]
                    # Here we can read off the question slots; multi-word
                    # slots are joined with underscores.
                    slots = question_label["questionSlots"]
                    wh = '_'.join(slots["wh"].split())
                    aux = '_'.join(slots["aux"].split())
                    subj = '_'.join(slots["subj"].split())
                    # The verb slot's last token names the inflected form.
                    verb_type = slots["verb"]
                    inflected_verb = verb_inflected_forms[verb_type.split()[-1]]
                    if len(verb_type.split()) == 1:
                        trg = inflected_verb
                    else:
                        trg = '_'.join(verb_type.split()[:-1] + [inflected_verb])
                    obj1 = '_'.join(slots["obj"].split())
                    pp = '_'.join(slots["prep"].split())
                    obj2 = '_'.join(slots["obj2"].split())

                    slotted_q = " ".join(
                        (wh, aux, subj, trg, obj1, pp, obj2, "?"))

                    curSurfacePred = augment_pred_with_question(
                        base_pred, slotted_q)
                    if len(curSurfacePred) > len(surfacePred):
                        surfacePred = curSurfacePred

                    questions.append(slotted_q)
                    answer_list.append(consolidated_ans)

                    # An earlier, disabled variant augmented the verb with
                    # its aux modifier here (only the surface verb and aux
                    # were emitted); the methodology may be worth revisiting.
                # At this point we have sentence tokens, the verb index,
                # valid questions, and valid answer spans. For each
                # predicate, build QA-pair sets such that every unique
                # combination of questions and answers appears: e.g. 2
                # questions with 2 answers each lead to four sets
                # ((q1,a1),(q2,a1)), ((q1,a1),(q2,a2)), etc. Each set leads
                # to one extraction.
                # Known issues: the rare answer often doesn't make sense
                # (e.g. "Clouds that form on the ground are called fog"),
                # and some questions encode a similar argument, e.g. for
                # "These organisms need the oxygen plants release to get
                # energy out of the food ." both "why does something need
                # something?" and "what does someone need something for?"
                # yield "to get energy out of the food".
                # Edits under consideration: for each argument, take only
                # the first question that appears for it; only consider an
                # answer span if it appears for more than one annotator,
                # since rare answers tend to be misleading.
                augmented_pred_indices = fuzzy_match_phrase(
                    surfacePred.split(" "), sentence.split(" "))
                if not augmented_pred_indices:
                    # Fall back to the verb's own index as the head.
                    head_pred_index = [verb_index]
                else:
                    head_pred_index = augmented_pred_indices[0]
                # One Extraction per unique combination of one answer per
                # question.
                for ans_set in itertools.product(*answer_list):
                    cur = Extraction((surfacePred, [head_pred_index]),
                                     verb_index,
                                     sentence,
                                     confidence=1.0,
                                     question_dist=question_dist)
                    q_as = list(zip(questions, ans_set))
                    if not q_as:
                        continue
                    for q, a in q_as:
                        preproc_arg = self.preproc(a)
                        if not preproc_arg:
                            logging.warning(
                                "Argument reduced to None: {}".format(a))
                            continue  # skip arguments that vanish in preprocessing
                        indices = fuzzy_match_phrase(preproc_arg.split(" "),
                                                     sentence.split(" "))
                        cur.addArg((preproc_arg, indices), q)

                    if cur.noPronounArgs():
                        cur.resolveAmbiguity()
                        if self.write:
                            if self.sort:
                                cur.getSortedArgs()
                            f_out.write(cur.conll(external_feats=[1, 2]))
                            f_out.write('\n')
                        # Also emit an output file for downstream evaluation
                        # on the OIE-2016 framework; the arguments appear in
                        # the order they occur in the QA file.
                        if self.output_eval:
                            if self.sort:
                                cur.getSortedArgs()
                            eval_out.write(sentence + ' \t' + str(cur) + '\n')

                        self.extractions.append(cur)
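
The per-predicate fan-out described in the comments is driven by itertools.product: with one answer list per question, every combination of one answer per question becomes a QA-pair set, and each set becomes one extraction. A standalone illustration, independent of the QA-SRL classes above:

    import itertools

    questions = ["what _ _ needs something _ _ ?",
                 "why does something need something _ _ ?"]
    answer_list = [["organisms"],                    # answers to question 1
                   ["to get energy", "for energy"]]  # answers to question 2

    # 1 answer x 2 answers -> 2 QA-pair sets; 2 x 2 would give 4.
    for ans_set in itertools.product(*answer_list):
        print(list(zip(questions, ans_set)))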