Example #1
def add_pos_feature_one(params):
    # params: (worker index, input file name, start line, end line)
    index, fn, start, end = params
    lno = 0
    ret = []
    # restore bracket tokens from their Penn Treebank escape forms
    fix = lambda x: x.replace('-LSB-', '[').replace('-RSB-', ']').replace(
        '-LCB-', '{').replace('-RCB-', '}').replace('-LRB-', '(').replace(
            '-RRB-', ')')
    with open(fn) as fin:
        # skip lines before this worker's chunk
        while lno < start:
            fin.readline()
            lno += 1

        parser = NLPParser()
        # POS-tag each line in [start, end)
        for i in xrange(start, end):
            line = fin.readline()
            line = line.decode('utf8').strip().split('\t')
            pattern, sentence = line[-2:]
            pattern = naive_split(pattern)
            sentence = naive_split(sentence)
            line[-2] = ' '.join(pattern)
            tokens, poss = parser.tag_pos(' '.join(sentence))
            tokens = [fix(t) for t in tokens]
            poss = [u'<START>'] + poss + [u'<END>']
            # line = '\t'.join([w + ' ' + p for w, p in zip(line, poss)])
            line.append(' '.join(poss))
            line.append(' '.join(tokens))
            line = ('\t'.join(line)).encode('utf8')
            ret.append(line)
    return ret
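The params tuple (worker index, file name, line range) suggests add_pos_feature_one is meant to be fanned out across worker processes. A minimal driver sketch under that assumption; the chunking logic and the add_pos_feature_parallel name are illustrative, not from the repo:

from multiprocessing import Pool

def add_pos_feature_parallel(fn_in, fn_out, num_workers=4):
    # count lines so the input can be split into roughly equal chunks
    with open(fn_in) as fin:
        total = sum(1 for _ in fin)
    step = (total + num_workers - 1) // num_workers
    params = [(i, fn_in, i * step, min((i + 1) * step, total))
              for i in xrange(num_workers)]
    pool = Pool(num_workers)
    with open(fn_out, 'w') as fout:
        for chunk in pool.map(add_pos_feature_one, params):
            for line in chunk:
                fout.write(line + '\n')
    pool.close()
    pool.join()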
Example #2
File: predict.py Project: mindis/KBQA
    def tag(self, sentence):
        if self.use_part_of_speech:
            sentence, poss = self.get_pos_tag(sentence)
        else:
            sentence = naive_split(sentence)
            poss = None
        data = self.dataset.create_model_input(sentence, poss)
        viterbi_sequences, _ = self.model.predict(
            data['sentence_lengths'],
            data['word_ids'],
            data['char_for_ids'],
            data['char_rev_ids'],
            data['word_lengths'],
            data['cap_ids'],
            data['pos_ids'],
        )
        viterbi_sequence = viterbi_sequences[0]
        seq_len = data['sentence_lengths'][0]

        words = data['words'][0][:seq_len]
        mentions, pred_tag_sequence = self.dataset.get_mention_from_words(
            words, viterbi_sequence
        )  # 'mentions' contains start index of each mention
        mention_to_likelihood = dict()
        likelihood = self.get_sequence_likelihood(data, viterbi_sequences)[0]

        # every mention from this sequence shares the sequence-level likelihood
        for m in mentions:
            mention_to_likelihood[m] = likelihood
        res = dict()
        res['sentence'] = ' '.join(sentence)
        res['mentions'] = mention_to_likelihood
        if poss:
            res['pos'] = poss
        return res
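A hypothetical call on a constructed tagger instance (instantiation details are repo-specific and omitted here):

res = tagger.tag(u'who wrote the fault in our stars')  # 'tagger' built elsewhere
print res['sentence']                  # space-joined tokens
for mention, likelihood in res['mentions'].items():
    print mention, likelihood          # mention and its sequence likelihood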
Example #3
def gen_data_for_relation_matcher_from_WebQSP(fn_webqsp_list, fn_out,
                                              use_aqqu):
    pipe = Pipeline(use_aqqu)
    # extract the question -> candidate-relation mapping from the questions
    wq = []
    for fn in fn_webqsp_list:
        wq.extend(json.load(open(fn))['Questions'])
    with open(fn_out, 'w') as fout:
        for data in wq:
            question, candidates = pipe.gen_candidate_relations(
                data['RawQuestion'])
            pattern_to_correct = dict()
            for parse in data['Parses']:
                if not parse['PotentialTopicEntityMention']:
                    continue
                mention = ' '.join(
                    naive_split(parse['PotentialTopicEntityMention']))
                pattern = question.replace(mention, '<$>')
                if '<$>' not in pattern:
                    # mention replacement failed; log the question for inspection
                    print question
                if parse['InferentialChain']:
                    if pattern not in pattern_to_correct:
                        pattern_to_correct[pattern] = set()
                    pattern_to_correct[pattern].add(
                        parse['InferentialChain'][-1])
            for pattern, correct in pattern_to_correct.items():
                wrong = candidates - correct
                print >> fout, json.dumps({
                    "question": pattern,
                    "pos_relation": list(correct),
                    "neg_relation": list(wrong)
                })
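Each line written to fn_out is one JSON object; an illustrative record (the question text and relation names are made up to show the shape only):

{"question": "what character did <$> play in star wars",
 "pos_relation": ["film.actor.film"],
 "neg_relation": ["film.performance.character"]}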
Example #4
def gen_tagged_sentence(fn_list, fn_out, scheme):
    # pick the tagging function for the requested scheme (IOB vs IOBES)
    if scheme == "iob":
        fn = tag_sentence_iob
    else:
        fn = tag_sentence_iobes
    with open(fn_out, 'w') as fout:
        for fn_in in fn_list:
            with open(fn_in) as fin:
                for line in fin:
                    ll = line.decode('utf8').strip().split('\t')
                    entity = ll[0]
                    pattern, sentence = ll[-2:]
                    # pattern = pattern.lower().split()
                    # sentence = sentence.lower().split()
                    pattern = naive_split(pattern)
                    sentence = naive_split(sentence)
                    tagged = fn(pattern, sentence)
                    tags = ' '.join(t[1] for t in tagged)
                    sentence = ' '.join(t[0] for t in tagged)
                    print >> fout, ('%s\t%s\t%s' %
                                    (entity, sentence, tags)).encode('utf8')
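tag_sentence_iob and tag_sentence_iobes themselves are not part of this listing. A minimal sketch of the IOB variant, assuming the pattern is the sentence with the mention replaced by a single '<$>' token (as in examples #3 and #9); the repo's real implementation may differ:

def tag_sentence_iob_sketch(pattern, sentence):
    # the mention fills the '<$>' slot; every other word is tagged O
    slot = pattern.index('<$>')
    mention_len = len(sentence) - len(pattern) + 1
    tags = (['O'] * slot
            + ['B'] + ['I'] * (mention_len - 1)
            + ['O'] * (len(pattern) - slot - 1))
    return zip(sentence, tags)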
Example #5
def add_pos_feature_remote(fn_in, fn_out, parser):
    with open(fn_out, 'w') as fout, open(fn_in) as fin:
        for line in fin:
            line = line.decode('utf8').strip().split('\t')
            pattern, sentence = line[-2:]
            pattern = naive_split(pattern)
            sentence = naive_split(sentence)
            line[-2] = ' '.join(pattern)
            tokens = parser.parse(' '.join(sentence))
            poss = [t.pos for t in tokens]
            tokens = [t.token for t in tokens]
            _, poss = merge_splited_word(sentence, tokens, poss)
            if poss is None:
                continue
            if len(sentence) != len(poss):
                # alignment failed: log both tokenizations for inspection
                print '|'.join(sentence)
                print '|'.join(tokens)

            poss = [u'<START>'] + poss + [u'<END>']
            # line = '\t'.join([w + ' ' + p for w, p in zip(line, poss)])
            line.append(' '.join(poss))
            line.append(' '.join(tokens))
            line = '\t'.join(line)
            print >> fout, line.encode('utf8')
Example #6
    def get_candidate_topic_entities(self, question):
        question = ' '.join(naive_split(question))
        tokens = self.parser.parse(question)
        entities = self.entity_linker.identify_entities_in_tokens(tokens)
        mid_to_item = dict()

        for e in entities:
            mid = e.get_mid()
            # fall back to the surface form when no Freebase mid was linked
            if not mid.startswith('m.'):
                mid = e.surface_name
            # keep only the highest-scoring candidate per key
            if mid not in mid_to_item or mid_to_item[mid][
                    'entity_score'] < e.surface_score:
                mid_to_item[mid] = dict()
                mid_to_item[mid]['topic'] = mid
                mid_to_item[mid]['mention'] = e.surface_name
                mid_to_item[mid]['entity_score'] = e.surface_score

        return question, mid_to_item.values()
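The loop deduplicates candidates by mid and keeps the highest surface_score per key. An illustrative return value (the mid and score are made up):

question: 'who plays gandalf in the lord of the rings'
items:    [{'topic': 'm.0gwlg', 'mention': 'gandalf', 'entity_score': 0.92}, ...]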
Example #7
File: predict.py Project: mindis/KBQA
    def tag_top2(self, sentence):
        if self.use_part_of_speech:
            sentence, poss = self.get_pos_tag(sentence)
        else:
            sentence = naive_split(sentence)
            poss = None
        data = self.dataset.create_model_input(sentence, poss)

        viterbi_sequences, scores = self.model.predict_top_k(
            data['sentence_lengths'],
            data['word_ids'],
            data['char_for_ids'],
            data['char_rev_ids'],
            data['word_lengths'],
            data['cap_ids'],
            data['pos_ids'],
        )
        seq_len = data['sentence_lengths'][0]
        words = data['words'][0][:seq_len]
        mention_to_likelihood = dict()
        for k in range(2):
            # keep the second-best path only if its score is within 95% of the best
            if k == 1 and scores[0][1] * 1.0 / scores[0][0] < 0.95:
                break
            viterbi_sequence_ = viterbi_sequences[0][k]
            likelihood = self.get_sequence_likelihood(data,
                                                      [viterbi_sequence_])[0]

            pred_entities, pred_tag_sequence = self.dataset.get_mention_from_words(
                words, viterbi_sequence_)
            for e in pred_entities:
                if e not in mention_to_likelihood:
                    mention_to_likelihood[e] = likelihood

        res = dict()
        res['mentions'] = mention_to_likelihood
        res['sentence'] = ' '.join(sentence)
        if poss:
            res['pos'] = poss
        return res
Example #8
File: predict.py Project: mindis/KBQA
    def get_mention_likelihood(self, question, mention):
        if self.use_part_of_speech:
            sentence, poss = self.get_pos_tag(question)
        else:
            sentence = naive_split(question)
            poss = None
        mention = mention.split()
        data = self.dataset.create_model_input(sentence, poss)
        # locate the mention span inside the tokenized sentence
        start = find_word(sentence, mention)
        end = start + len(mention)
        tag_ids = self.dataset.create_tag_sequence(start, end, len(sentence),
                                                   self.tag_scheme)
        scores = self.model.get_likelihood(
            tag_ids,
            data['sentence_lengths'],
            data['word_ids'],
            data['char_for_ids'],
            data['char_rev_ids'],
            data['word_lengths'],
            data['cap_ids'],
            data['pos_ids'],
        )
        return question, scores.tolist()[0]
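find_word is referenced but not shown in this listing; a plausible sketch (the real helper may handle a missing mention differently):

def find_word_sketch(sentence, mention):
    # return the start index of the first occurrence of the token list
    # 'mention' inside the token list 'sentence', or -1 if absent
    for i in xrange(len(sentence) - len(mention) + 1):
        if sentence[i:i + len(mention)] == mention:
            return i
    return -1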
Example #9
def gen_data_for_relation_matcher(fn_webquestion_list, fn_simplequestion_list,
                                  fn_out):
    pipeline = Pipeline()

    # symbols = {"_", "_'s", "_'"}

    def map_word(x):
        # tokens containing the '_' placeholder mark the mention slot
        if x.startswith('_') or x.endswith('_'):
            return x.replace('_', "<$>")
        else:
            return x

    with open(fn_out, 'w') as fout:
        # process the SimpleQuestions files
        for fn in fn_simplequestion_list:
            with open(fn) as fin:
                for line in fin:
                    _, positive_relation, _, pattern, question = line.decode(
                        'utf8').strip().split('\t')
                    # question, candidate_relations = pipeline.gen_candidate_relations(question)
                    pattern = ' '.join(
                        naive_split(' '.join(
                            [map_word(w) for w in pattern.split()])))
                    # negative_relations = candidate_relations - {positive_relation}
                    print >> fout, json.dumps(
                        {
                            "question": pattern,
                            "pos_relation": [positive_relation],
                            # "neg_relation": list(negative_relations)},
                            "neg_relation": []
                        },
                        ensure_ascii=False).encode('utf8')

        # process the WebQuestions files
        for fn in fn_webquestion_list:
            webquestion = json.load(open(fn), encoding="utf8")
            for data in webquestion:
                positive_relations = set()
                for path in data['paths']:
                    if path[1] not in {
                            "forward_pass_non_cvt", "forward_pass_cvt",
                            "forward_direct"
                    }:
                        raise ValueError('path type error')
                    if path[1] == "forward_pass_cvt" or path[
                            1] == "forward_direct":
                        positive_relations.add(path[0].split()[-2])
                if len(positive_relations) == 0:
                    continue
                pattern = ' '.join(
                    naive_split(' '.join(
                        [map_word(w) for w in data['sentence'].split()])))

                question, candidate_relations = pipeline.gen_candidate_relations(
                    data['utterance'])
                negative_relations = candidate_relations - positive_relations
                print >> fout, json.dumps(
                    {
                        "question": pattern,
                        "pos_relation": list(positive_relations),
                        "neg_relation": list(negative_relations)
                    },
                    ensure_ascii=False).encode('utf8')
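For reference, map_word rewrites the underscore placeholder used in the SimpleQuestions patterns into the '<$>' slot marker; a quick illustration (the input tokens are made up):

>>> [map_word(w) for w in "where was _'s author born".split()]
['where', 'was', "<$>'s", 'author', 'born']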
Example #10
File: predict.py Project: mindis/KBQA
    def get_pos_tag(self, sentence):
        sentence = naive_split(sentence)
        tokens, poss = self.nlp_parser.tag_pos(' '.join(sentence))
        # restore Penn Treebank bracket escapes in the tagged tokens
        tokens = [self.fix(t) for t in tokens]
        return tokens, poss