def add_pos_feature_one(params):
    """Tag one chunk of a pattern/sentence TSV file with POS features.

    params is a tuple (index, fn, start, end): the chunk index, the input
    file name, and the half-open line range [start, end) to process.
    Returns the augmented lines (POS tags and fixed tokens appended).
    """
    index, fn, start, end = params
    lno = 0
    ret = []
    # Undo Penn Treebank bracket escapes produced by the tagger.
    fix = lambda x: x.replace('-LSB-', '[').replace('-RSB-', ']').replace(
        '-LCB-', '{').replace('-RCB-', '}').replace('-LRB-', '(').replace(
        '-RRB-', ')')
    with open(fn) as fin:
        # Skip lines before this chunk.
        while lno < start:
            fin.readline()
            lno += 1
        parser = NLPParser()
        for i in xrange(start, end):
            line = fin.readline()
            line = line.decode('utf8').strip().split('\t')
            pattern, sentence = line[-2:]
            pattern = naive_split(pattern)
            sentence = naive_split(sentence)
            line[-2] = ' '.join(pattern)
            tokens, poss = parser.tag_pos(' '.join(sentence))
            tokens = [fix(t) for t in tokens]
            poss = [u'<START>'] + poss + [u'<END>']
            # line = '\t'.join([w + ' ' + p for w, p in zip(line, poss)])
            line.append(' '.join(poss))
            line.append(' '.join(tokens))
            line = ('\t'.join(line)).encode('utf8')
            ret.append(line)
    return ret
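# Illustrative only: a minimal sketch of driving add_pos_feature_one in
# parallel over chunks of one file. The file names, chunk size, and the use
# of multiprocessing.Pool are assumptions, not part of the original pipeline;
# only the (index, fn, start, end) parameter layout comes from the function
# above.
if __name__ == '__main__':
    from multiprocessing import Pool
    fn = 'data/simple.train.tsv'  # hypothetical input file
    num_lines = sum(1 for _ in open(fn))
    chunk = 10000
    params = [(i, fn, s, min(s + chunk, num_lines))
              for i, s in enumerate(xrange(0, num_lines, chunk))]
    pool = Pool(4)
    with open('data/simple.train.pos.tsv', 'w') as fout:  # hypothetical output
        for lines in pool.map(add_pos_feature_one, params):
            for line in lines:
                print >> fout, line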
def tag(self, sentence):
    if self.use_part_of_speech:
        sentence, poss = self.get_pos_tag(sentence)
    else:
        sentence = naive_split(sentence)
        poss = None
    data = self.dataset.create_model_input(sentence, poss)
    viterbi_sequences, _ = self.model.predict(
        data['sentence_lengths'],
        data['word_ids'],
        data['char_for_ids'],
        data['char_rev_ids'],
        data['word_lengths'],
        data['cap_ids'],
        data['pos_ids'],
    )
    viterbi_sequence = viterbi_sequences[0]
    seq_len = data['sentence_lengths'][0]
    words = data['words'][0][:seq_len]
    mentions, pred_tag_sequence = self.dataset.get_mention_from_words(
        words, viterbi_sequence)
    # 'mentions' contains start index of each mention
    mention_to_likelihood = dict()
    likelihood = self.get_sequence_likelihood(data, viterbi_sequences)[0]
    for m in mentions:
        mention_to_likelihood[m] = likelihood
    res = dict()
    res['sentence'] = ' '.join(sentence)
    res['mentions'] = mention_to_likelihood
    if poss:
        res['pos'] = poss
    return res
def gen_data_for_relation_matcher_from_WebQSP(fn_webqsp_list, fn_out, use_aqqu):
    pipe = Pipeline(use_aqqu)
    # Extract the question -> candidate relation mapping from the questions.
    wq = []
    for fn in fn_webqsp_list:
        wq.extend(json.load(open(fn))['Questions'])
    with open(fn_out, 'w') as fout:
        for data in wq:
            question, candidates = pipe.gen_candidate_relations(
                data['RawQuestion'])
            pattern_to_correct = dict()
            for parse in data['Parses']:
                if not parse['PotentialTopicEntityMention']:
                    continue
                mention = ' '.join(
                    naive_split(parse['PotentialTopicEntityMention']))
                pattern = question.replace(mention, '<$>')
                if '<$>' not in pattern:
                    print question
                if parse['InferentialChain']:
                    if pattern not in pattern_to_correct:
                        pattern_to_correct[pattern] = set()
                    pattern_to_correct[pattern].add(
                        parse['InferentialChain'][-1])
            for pattern, correct in pattern_to_correct.items():
                wrong = candidates - correct
                print >> fout, json.dumps({
                    'question': pattern,
                    'pos_relation': list(correct),
                    'neg_relation': list(wrong)
                })
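# Illustrative only: a sketch of invoking the WebQSP converter above. The
# file paths and the use_aqqu value are assumptions; only the function
# signature comes from the code above.
gen_data_for_relation_matcher_from_WebQSP(
    ['data/WebQSP.train.json', 'data/WebQSP.test.json'],  # hypothetical paths
    'data/webqsp.relation_matcher.json',                  # hypothetical output
    use_aqqu=True)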
def gen_tagged_sentence(fn_list, fn_out, scheme):
    if scheme == "iob":
        fn = tag_sentence_iob
    else:
        fn = tag_sentence_iobes
    with open(fn_out, 'w') as fout:
        for fn_in in fn_list:
            with open(fn_in) as fin:
                for line in fin:
                    ll = line.decode('utf8').strip().split('\t')
                    entity = ll[0]
                    pattern, sentence = ll[-2:]
                    # pattern = pattern.lower().split()
                    # sentence = sentence.lower().split()
                    pattern = naive_split(pattern)
                    sentence = naive_split(sentence)
                    tagged = fn(pattern, sentence)
                    tags = ' '.join([t[1] for t in tagged])
                    sentence = ' '.join(t[0] for t in tagged)
                    print >> fout, ('%s\t%s\t%s' %
                                    (entity, sentence, tags)).encode('utf8')
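# Illustrative only: a sketch of producing IOBES-tagged training data with the
# helper above. The input/output paths are assumptions; tag_sentence_iob and
# tag_sentence_iobes are expected to be defined elsewhere in the module.
gen_tagged_sentence(
    ['data/simple.train.pos.tsv', 'data/webq.train.pos.tsv'],  # hypothetical
    'data/tagged.train.txt',                                   # hypothetical
    scheme='iobes')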
def add_pos_feature_remote(fn_in, fn_out, parser):
    with open(fn_out, 'w') as fout, open(fn_in) as fin:
        for line in fin:
            line = line.decode('utf8').strip().split('\t')
            pattern, sentence = line[-2:]
            pattern = naive_split(pattern)
            sentence = naive_split(sentence)
            line[-2] = ' '.join(pattern)
            tokens = parser.parse(' '.join(sentence))
            poss = [t.pos for t in tokens]
            tokens = [t.token for t in tokens]
            _, poss = merge_splited_word(sentence, tokens, poss)
            if poss is None:
                continue
            if len(sentence) != len(poss):
                # Token/POS alignment mismatch; log it for inspection.
                print '|'.join(sentence)
                print '|'.join(tokens)
            poss = [u'<START>'] + poss + [u'<END>']
            # line = '\t'.join([w + ' ' + p for w, p in zip(line, poss)])
            line.append(' '.join(poss))
            line.append(' '.join(tokens))
            line = '\t'.join(line)
            print >> fout, line.encode('utf8')
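# Illustrative only: a sketch of calling the remote-parser variant above. The
# parser object is assumed to expose a parse() method whose returned tokens
# carry .token and .pos attributes, exactly as the function uses them;
# RemoteNLPParser and the file names are hypothetical stand-ins.
parser = RemoteNLPParser('http://localhost:9000')  # hypothetical parser client
add_pos_feature_remote('data/webq.train.tsv',      # hypothetical input
                       'data/webq.train.pos.tsv',  # hypothetical output
                       parser)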
def get_candidate_topic_entities(self, question):
    question = ' '.join(naive_split(question))
    tokens = self.parser.parse(question)
    entities = self.entity_linker.identify_entities_in_tokens(tokens)
    mid_to_item = dict()
    for e in entities:
        mid = e.get_mid()
        if not mid.startswith('m.'):
            mid = e.surface_name
        if mid not in mid_to_item or \
                mid_to_item[mid]['entity_score'] < e.surface_score:
            mid_to_item[mid] = dict()
            mid_to_item[mid]['topic'] = mid
            mid_to_item[mid]['mention'] = e.surface_name
            mid_to_item[mid]['entity_score'] = e.surface_score
    return question, mid_to_item.values()
def tag_top2(self, sentence):
    if self.use_part_of_speech:
        sentence, poss = self.get_pos_tag(sentence)
    else:
        sentence = naive_split(sentence)
        poss = None
    data = self.dataset.create_model_input(sentence, poss)
    viterbi_sequences, scores = self.model.predict_top_k(
        data['sentence_lengths'],
        data['word_ids'],
        data['char_for_ids'],
        data['char_rev_ids'],
        data['word_lengths'],
        data['cap_ids'],
        data['pos_ids'],
    )
    seq_len = data['sentence_lengths'][0]
    words = data['words'][0][:seq_len]
    mention_to_likelihood = dict()
    for k in range(2):
        # Keep the second-best path only if its score is at least 95% of the
        # best path's score.
        if k == 1 and scores[0][1] * 1.0 / scores[0][0] < 0.95:
            break
        viterbi_sequence_ = viterbi_sequences[0][k]
        likelihood = self.get_sequence_likelihood(data, [viterbi_sequence_])[0]
        pred_entities, pred_tag_sequence = self.dataset.get_mention_from_words(
            words, viterbi_sequence_)
        for e in pred_entities:
            if e not in mention_to_likelihood:
                mention_to_likelihood[e] = likelihood
    res = dict()
    res['mentions'] = mention_to_likelihood
    res['sentence'] = ' '.join(sentence)
    if poss:
        res['pos'] = poss
    return res
def get_mention_likelihood(self, question, mention):
    if self.use_part_of_speech:
        sentence, poss = self.get_pos_tag(question)
    else:
        sentence = naive_split(question)
        poss = None
    mention = mention.split()
    data = self.dataset.create_model_input(sentence, poss)
    start = find_word(sentence, mention)
    end = start + len(mention)
    tag_ids = self.dataset.create_tag_sequence(start, end, len(sentence),
                                               self.tag_scheme)
    scores = self.model.get_likelihood(
        tag_ids,
        data['sentence_lengths'],
        data['word_ids'],
        data['char_for_ids'],
        data['char_rev_ids'],
        data['word_lengths'],
        data['cap_ids'],
        data['pos_ids'],
    )
    return question, scores.tolist()[0]
def gen_data_for_relation_matcher(fn_webquestion_list, fn_simplequstion_list,
                                  fn_out):
    pipeline = Pipeline()

    # symbols = {"_", "_'s", "_'"}
    def map_word(x):
        if x.startswith('_') or x.endswith('_'):
            return x.replace('_', "<$>")
        else:
            return x

    with open(fn_out, 'w') as fout:
        # Process SimpleQuestions: each line already pairs a pattern with its
        # gold relation, so only positive relations are emitted.
        for fn in fn_simplequstion_list:
            with open(fn) as fin:
                for line in fin:
                    _, positive_relation, _, pattern, question = line.decode(
                        'utf8').strip().split('\t')
                    # question, candidate_relations = pipeline.gen_candidate_relations(question)
                    pattern = ' '.join(
                        naive_split(' '.join(
                            [map_word(w) for w in pattern.split()])))
                    # negative_relations = candidate_relations - {positive_relation}
                    print >> fout, json.dumps(
                        {
                            "question": pattern,
                            "pos_relation": [positive_relation],
                            # "neg_relation": list(negative_relations)},
                            "neg_relation": []
                        },
                        ensure_ascii=False).encode('utf8')
        # Process WebQuestions: collect positive relations from the annotated
        # paths and use the remaining candidate relations as negatives.
        for fn in fn_webquestion_list:
            webquestion = json.load(open(fn), encoding="utf8")
            for data in webquestion:
                positive_relations = set()
                for path in data['paths']:
                    if path[1] not in {
                            "forward_pass_non_cvt", "forward_pass_cvt",
                            "forward_direct"
                    }:
                        raise ValueError('path type error')
                    if path[1] == "forward_pass_cvt" or path[1] == "forward_direct":
                        positive_relations.add(path[0].split()[-2])
                if len(positive_relations) == 0:
                    continue
                pattern = ' '.join(
                    naive_split(' '.join(
                        [map_word(w) for w in data['sentence'].split()])))
                question, candidate_relations = pipeline.gen_candidate_relations(
                    data['utterance'])
                negative_relations = candidate_relations - positive_relations
                print >> fout, json.dumps(
                    {
                        "question": pattern,
                        "pos_relation": list(positive_relations),
                        "neg_relation": list(negative_relations)
                    },
                    ensure_ascii=False).encode('utf8')
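# Illustrative only: a sketch of generating relation-matcher training data
# from both datasets with the function above. The file paths are assumptions;
# only the argument names come from the signature above.
gen_data_for_relation_matcher(
    fn_webquestion_list=['data/webquestions.train.json'],       # hypothetical
    fn_simplequstion_list=['data/simplequestions.train.tsv'],   # hypothetical
    fn_out='data/relation_matcher.train.json')                  # hypothetical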
def get_pos_tag(self, sentence):
    sentence = naive_split(sentence)
    tokens, poss = self.nlp_parser.tag_pos(' '.join(sentence))
    tokens = [self.fix(t) for t in tokens]
    return tokens, poss