def loadFile(self, lines):
    """
    Parse tab-separated QA annotations into extractions grouped by sentence.

    ``lines`` is the whole file content as one string. After stripping,
    a non-empty line without tabs starts a new sentence; every other
    non-empty line describes one predicate of the current sentence:
    ``predicate <TAB> index <TAB> q1 <TAB> a1 <TAB> q2 <TAB> a2 ...``.
    The second field is not used; predicate and argument positions are
    looked up in the sentence with ``all_index(..., matchCase=False)``.

    Each extraction of a sentence gets an ``indsForQuestions`` attribute:
    a dict mapping every question seen for that sentence to the union of
    the indices of its answers.

    Returns a dict mapping each sentence to the list of its extractions
    for which ``noPronounArgs()`` is true.
    """
    d = {}
    sent = ''
    question_inds = defaultdict(set)

    for line in lines.split('\n'):
        line = line.strip()
        if not line:
            continue
        data = line.split('\t')

        if len(data) == 1:
            # A new sentence starts: hand the indices collected so far to
            # the extractions of the sentence that just ended.
            if sent:
                for ex in d[sent]:
                    ex.indsForQuestions = dict(question_inds)
            sent = line
            d[sent] = []
            question_inds = defaultdict(set)
            continue

        pred = data[0]
        cur = Extraction((pred, all_index(sent, pred, matchCase=False)),
                         sent,
                         confidence=1.0,
                         question_dist=self.question_dist)
        # Fields after the first two alternate question, answer.
        for q, a in zip(data[2::2], data[3::2]):
            indices = all_index(sent, a, matchCase=False)
            cur.addArg((a, indices), q)
            question_inds[q] = question_inds[q].union(indices)

        # Predicate lines that appear before any sentence line are dropped.
        if sent and cur.noPronounArgs():
            d[sent].append(cur)

    # The loop above only flushes when the next sentence begins, so the
    # last sentence has to be flushed here or its extractions would never
    # receive indsForQuestions.
    if sent:
        for ex in d[sent]:
            ex.indsForQuestions = dict(question_inds)

    return d
def loadFile(self, lines):
    """
    Parse tab-separated QA annotations into extractions grouped by sentence.

    ``lines`` is the whole file content as one string. After stripping,
    a non-empty line without tabs starts a new sentence; every other
    non-empty line describes one predicate of the current sentence:
    ``predicate <TAB> predicate indices <TAB> head index <TAB> q1 <TAB>
    a1 <TAB> q2 <TAB> a2 ...``. The predicate and every answer are passed
    through ``self.preproc``; answer positions are found with
    ``fuzzy_match_phrase`` over the space-split sentence.

    Each extraction of a sentence gets an ``indsForQuestions`` attribute:
    a dict mapping every question seen for that sentence to the union of
    the flattened indices of its answers.

    Returns a dict mapping each sentence to the list of its extractions
    for which ``noPronounArgs()`` is true, after ``resolveAmbiguity()``
    has been called on them.
    """
    d = {}
    sent = ''
    question_inds = defaultdict(set)

    for line in lines.split('\n'):
        line = line.strip()
        if not line:
            continue
        data = line.split('\t')

        if len(data) == 1:
            # A new sentence starts: hand the indices collected so far to
            # the extractions of the sentence that just ended.
            if sent:
                for ex in d[sent]:
                    ex.indsForQuestions = dict(question_inds)
            sent = line
            d[sent] = []
            question_inds = defaultdict(set)
            continue

        pred = self.preproc(data[0])
        # NOTE(review): eval() executes whatever expression the file holds.
        # If this field is always a plain list literal, ast.literal_eval
        # would be the safer choice - confirm the data format first.
        # Built as a real list so the value is the same on Python 2 and 3.
        pred_indices = [int(i) for i in eval(data[1])]
        head_pred_index = int(data[2])
        cur = Extraction((pred, [pred_indices]),
                         head_pred_index,
                         sent,
                         confidence=1.0)

        # Fields after the first three alternate question, answer.
        for q, a in zip(data[3::2], data[4::2]):
            preproc_arg = self.preproc(a)
            if not preproc_arg:
                logging.warning("Argument reduced to None: {}".format(a))
            indices = fuzzy_match_phrase(preproc_arg.split(" "),
                                         sent.split(" "))
            cur.addArg((preproc_arg, indices), q)
            question_inds[q] = question_inds[q].union(flatten(indices))

        # Predicate lines that appear before any sentence line are dropped.
        if sent and cur.noPronounArgs():
            cur.resolveAmbiguity()
            d[sent].append(cur)

    # The loop above only flushes when the next sentence begins, so the
    # last sentence has to be flushed here or its extractions would never
    # receive indsForQuestions.
    if sent:
        for ex in d[sent]:
            ex.indsForQuestions = dict(question_inds)

    return d
def read(self):
    """
    Convert the QA-SRL annotations in ``self.qa_path`` into extractions.

    ``self.qa_path`` is read as UTF-8 JSON lines. Only items whose
    ``sentenceId`` starts with ``TQA:`` are processed. For every verb
    entry of such a sentence:

    * a question is kept when its share of valid answer judgments is at
      least ``self.min_correct``;
    * its valid answer spans are merged by
      ``consolidate_answers(spans, self.length)`` and looked up in the
      sentence tokens;
    * its slots are rendered as ``"wh aux subj verb obj prep obj2 ?"``,
      with multi-word slots joined by underscores;
    * the surface predicate is the longest result of
      ``augment_pred_with_question`` over the kept questions (the bare
      verb token if none is longer).

    One ``Extraction`` is built for every combination that picks a single
    answer per kept question. Extractions for which ``noPronounArgs()``
    is true are disambiguated and appended to ``self.extractions``; they
    are also written in CoNLL form to ``self.output_file`` when
    ``self.write`` is set, and to ``science_eval.oie`` when
    ``self.output_eval`` is set. With ``self.output_eval`` set, every
    processed sentence is additionally written to
    ``science_eval_sent.jsonl``. All three output files are created
    (truncated) regardless of the flags.

    Returns None.

    NOTE(review): the byte/str mixing below (``.encode('utf-8')``
    followed by concatenation with plain string literals) looks like it
    assumes Python 2 - confirm before running under Python 3.
    """

    def join_slot(slots, name):
        # Collapse a (possibly multi-word) question slot into one token.
        return '_'.join(slots[name].split())

    def slotted_question(question_label, verb_inflected_forms):
        # Render the question slots as "wh aux subj verb obj prep obj2 ?".
        slots = question_label["questionSlots"]
        wh = join_slot(slots, "wh")
        aux = join_slot(slots, "aux")
        subj = join_slot(slots, "subj")
        # The last word of the verb slot names the inflected form to look
        # up; any words before it are kept in front of the inflected verb.
        verb_parts = slots["verb"].split()
        inflected_verb = verb_inflected_forms[verb_parts[-1]]
        trg = "_".join(verb_parts[:-1] + [inflected_verb])
        obj1 = join_slot(slots, "obj")
        pp = join_slot(slots, "prep")
        obj2 = join_slot(slots, "obj2")
        return " ".join((wh, aux, subj, trg, obj1, pp, obj2, "?"))

    # NOTE(review): question_dist is parsed but never used below -
    # Extraction is handed self.dist_file instead. The load is kept so a
    # missing or malformed distribution file still fails early; confirm
    # which of the two Extraction actually expects.
    # Open question from the original author: what to do about generalized
    # questions that are not in this distribution yet (see analyze.py).
    if self.dist_file:
        with open(self.dist_file) as dist_in:
            raw_dist = json.load(dist_in)
        question_dist = dict(
            (q, dict((int(loc), cnt) for (loc, cnt) in dist.items()))
            for (q, dist) in raw_dist.items())
    else:
        question_dist = {}

    # Load the JSON-lines annotations into a list.
    with codecs.open(self.qa_path, 'r', encoding='utf8') as f:
        data = [json.loads(line) for line in f]

    # The context manager guarantees the outputs are flushed and closed,
    # including when an item raises half-way through.
    with open(self.output_file, "w") as f_out, \
            open('science_eval_sent.jsonl', "w") as jsonl_out, \
            open('science_eval.oie', "w") as eval_out:
        for item in data:
            sent_id = item["sentenceId"].encode('utf-8')
            # Keep only sentences from the TQA source.
            if sent_id.split(':')[0] != 'TQA':
                continue

            sentence_tokens = item["sentenceTokens"]
            sentence_text = ' '.join(sentence_tokens)
            sentence = sentence_text.encode('utf-8')
            if self.output_eval:
                # json.dumps escapes quotes and backslashes inside the
                # sentence, which plain string concatenation would not.
                jsonl_out.write(
                    json.dumps({"sentence": sentence_text},
                               ensure_ascii=False).encode('utf-8') + '\n')

            for verb_entry in item["verbEntries"].values():
                verb_index = verb_entry["verbIndex"]
                verb_inflected_forms = verb_entry["verbInflectedForms"]
                base_pred = sentence_tokens[verb_index]
                surface_pred = base_pred
                # questions[i] is answered by any element of answer_list[i].
                questions = []
                answer_list = []

                for question_label in verb_entry["questionLabels"].values():
                    judgments = question_label["answerJudgments"]
                    if not judgments:
                        # Nothing was judged: no answers, and the ratio
                        # below would divide by zero.
                        continue
                    valid = [ans for ans in judgments if ans["isValid"]]
                    if len(valid) / float(len(judgments)) < self.min_correct:
                        # Too few annotators accepted this question.
                        continue

                    ans_spans = [span
                                 for ans in valid
                                 for span in ans["spans"]]
                    consolidated_spans = consolidate_answers(
                        ans_spans, self.length)
                    # Look the consolidated spans up in the sentence tokens.
                    consolidated_ans = [
                        ' '.join(sentence_tokens[span[0]:span[1]])
                        for span in consolidated_spans
                    ]

                    slotted_q = slotted_question(question_label,
                                                 verb_inflected_forms)
                    # Keep the longest augmented predicate seen so far.
                    cur_surface_pred = augment_pred_with_question(
                        base_pred, slotted_q)
                    if len(cur_surface_pred) > len(surface_pred):
                        surface_pred = cur_surface_pred

                    questions.append(slotted_q)
                    answer_list.append(consolidated_ans)

                # Design notes carried over from the original author:
                # - rare answers often do not make sense; consider keeping
                #   an answer span only if more than one annotator gave it;
                # - different questions can encode the same argument (e.g.
                #   "what ... for" and "why"); consider keeping only the
                #   first question that appears for each argument.

                surface_pred = surface_pred.encode('utf-8')
                augmented_pred_indices = fuzzy_match_phrase(
                    surface_pred.split(" "), sentence.split(" "))
                if not augmented_pred_indices:
                    # The augmented predicate was not found in the
                    # sentence: fall back to the verb's own position.
                    head_pred_index = [verb_index]
                else:
                    head_pred_index = augmented_pred_indices[0]

                # Every way of choosing one answer per question yields one
                # extraction, e.g. two questions with two answers each
                # give four extractions.
                for ans_set in itertools.product(*answer_list):
                    q_as = list(zip(questions, ans_set))
                    if not q_as:
                        # No question survived the filter for this verb.
                        continue

                    cur = Extraction((surface_pred, [head_pred_index]),
                                     verb_index,
                                     sentence,
                                     confidence=1.0,
                                     question_dist=self.dist_file)
                    for q, a in q_as:
                        q = q.encode('utf-8')
                        a = a.encode('utf-8')
                        preproc_arg = self.preproc(a)
                        if not preproc_arg:
                            logging.warning(
                                "Argument reduced to None: {}".format(a))
                        indices = fuzzy_match_phrase(
                            preproc_arg.split(" "), sentence.split(" "))
                        cur.addArg((preproc_arg, indices), q)

                    if not cur.noPronounArgs():
                        continue

                    cur.resolveAmbiguity()
                    if self.write:
                        if self.sort:
                            cur.getSortedArgs()
                        f_out.write(cur.conll(external_feats=[1, 2]))
                        f_out.write('\n')
                    # Output for downstream evaluation with the OIE-2016
                    # evaluation framework. Without sorting, arguments
                    # seem to stay in the order they appear in the QA file.
                    if self.output_eval:
                        if self.sort:
                            cur.getSortedArgs()
                        eval_out.write(
                            sentence + ' \t' + cur.__str__() + '\n')
                    self.extractions.append(cur)