Пример #1
0
    def retrieve_query_doc(self):
        """Collect the documents for every query and index their cleaned text.

        For each query in ``self.queries`` the query name is handed to
        ``search`` together with ``self.lucene_searcher`` and
        ``self.lucene_analyzer`` (the trailing ``3000`` is presumably the
        maximum number of hits -- confirm against ``search``).  A returned
        document is kept only if the query name occurs at least four times
        in its text after spaces and newlines are removed.

        Side effects:
            * ``self.query_docs[q.id]`` is set to a dict mapping document id
              (last path component) to the original document text.
            * For each document id not yet in ``self.doc_mapping_table``,
              ``self.cleaned_docs[doc_id]`` receives the cleaned text and
              ``self.doc_mapping_table[doc_id]`` an ``OrderedDict`` mapping
              every cleaned-text offset to its offset in the original text.

        Raises:
            IndexError: if the cleaned text is not a subsequence of the
                original document, so no offset mapping exists.
        """
        for q in self.queries:
            q_results = dict()
            # NOTE: the trailing comma is kept on purpose.  Under the
            # Python 2 print statement it suppresses the newline so that
            # 'Done' ends up on the same line; removing it would change
            # the output there.
            print(q.name + '...'),
            found_doc_path = search(q.name, self.lucene_searcher,
                                    self.lucene_analyzer, 3000)
            for doc_path in found_doc_path:
                doc_id = doc_path.split('/')[-1].strip()
                # Context manager so the handle is closed right away; up to
                # thousands of documents may be opened per query.
                with io.open(doc_path, 'r', -1, 'utf-8') as doc_file:
                    doc_content = doc_file.read()
                # Skip documents that barely mention the query.
                if doc_content.replace(' ', '').replace('\n', '').count(q.name) < 4:
                    continue
                q_results[doc_id] = doc_content

                # A document shared by several queries is cleaned and
                # mapped only once; check before doing the cleaning work.
                if doc_id in self.doc_mapping_table:
                    continue

                # Clean the document: drop noise tags (datetime, headline,
                # dateline, etc.), XML tags, then spaces and line breaks.
                cleaned_doc = remove_doc_noise(doc_content)
                cleaned_doc = remove_xml_tag(cleaned_doc)
                cleaned_doc = remove_space_linebreak(cleaned_doc)

                # Build the offset mapping between the cleaned and the
                # original document.  Each cleaned character is matched to
                # its next occurrence in the original, scanning forward
                # only, so every entry is correct by construction.
                offset_mapping_table = OrderedDict()
                origin_doc_index = 0
                for cleaned_doc_index, char in enumerate(cleaned_doc):
                    origin_doc_index = doc_content.find(char, origin_doc_index)
                    if origin_doc_index < 0:
                        raise IndexError(
                            'cleaned text of document %s cannot be aligned '
                            'with the original at cleaned offset %d'
                            % (doc_id, cleaned_doc_index))
                    offset_mapping_table[cleaned_doc_index] = origin_doc_index
                    origin_doc_index += 1

                self.doc_mapping_table[doc_id] = offset_mapping_table
                self.cleaned_docs[doc_id] = cleaned_doc

            self.query_docs[q.id] = q_results

            print('Done')
Пример #2
0
    def stateorprovince(self, slot_type, evidence_slot_type):
        """Infer a province answer from the query's existing city answer.

        Args:
            slot_type: key of the province slot in
                ``self.query_answer.output``; the new answer is appended to
                the answers already stored under it.
            evidence_slot_type: key of the city slot whose answers serve as
                evidence.

        Returns:
            The current answers for ``slot_type`` plus one new ``LineOutput``
            for the inferred province.  The current answers are returned
            unchanged when no city answer has a slot filler, when no entry
            of ``self.china_province_city`` contains the city, when the
            search finds nothing, or when the top search hit does not
            contain the literal text ``province + city``.

        Side effects:
            On success the evidence document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of
            the city answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # Use the last city answer that carries a non-empty slot filler.
        city = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                city = line_output
        if city is None:
            return current_output

        # Normalise the city name before the lookup: run it through
        # jianfan.ftoj (presumably traditional -> simplified Chinese; confirm)
        # and strip the administrative suffix characters.
        city_slot_filler = jianfan.ftoj(city.slot_filler)
        for suffix in [u'区', u'县', u'市']:
            city_slot_filler = city_slot_filler.replace(suffix, '')

        # Infer the province from the city.  Entries with type 0 list their
        # cities directly under 'sub'; all others have one extra nesting
        # level ('sub' of each item in 'sub').
        province = ''
        for entry in self.china_province_city:
            if entry['type'] == 0:
                found = any(item['name'] == city_slot_filler
                            for item in entry['sub'])
            else:
                found = any(item['name'] == city_slot_filler
                            for group in entry['sub']
                            for item in group['sub'])
            if found:
                province = entry['name']
                break

        # If the inference fails, return the original answer.
        if not province:
            return current_output

        # Search for a document to serve as provenance.
        wide_text = province + city_slot_filler
        found_doc_path = search(wide_text, self.searcher, self.analyzer, 50)
        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        doc_id = evidence_doc_path.split('/')[-1].strip()
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()

        wp_beg = doc.find(wide_text)
        if wp_beg < 0:
            # The hit does not contain the literal text, so there are no
            # valid offsets to report; emitting -1 based offsets would
            # produce a bogus provenance.
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        # Offsets are inclusive.  The province is the prefix of wide_text,
        # so the slot filler starts where the wide provenance starts.
        wp_end = wp_beg + len(wide_text) - 1
        sp_beg = wp_beg
        sp_end = sp_beg + len(province) - 1

        answer = LineOutput()
        answer.slot_type = slot_type
        answer.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = wide_text
        answer.wide_provenance = [wide_prov]
        # The city answer is the evidence: flag its first provenance as an
        # inference step (this mutates the shared city LineOutput) and
        # carry all of its provenances over.
        city.wide_provenance[0].inference = True
        answer.wide_provenance += city.wide_provenance

        answer.slot_filler = province

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = province
        answer.slot_filler_prov = [filler_prov]

        answer.confidence_score = 1

        return current_output + [answer]
Пример #3
0
    def country(self, slot_type, evidence_slot_type):
        """Infer a country answer from the query's existing province answer.

        Args:
            slot_type: key of the country slot in
                ``self.query_answer.output``; the new answer is appended to
                the answers already stored under it.
            evidence_slot_type: key of the province slot whose answers serve
                as evidence.

        Returns:
            The current answers for ``slot_type`` plus one new ``LineOutput``
            for the inferred country.  When the province contains the text
            for Taiwan, the province ``LineOutput`` itself is appended as
            well.  The current answers are returned unchanged when no
            province answer has a slot filler, when no country in
            ``self.world_coutry_province`` lists the province, when the
            search finds nothing, or when the top search hit does not
            contain the literal text ``country + province``.

        Side effects:
            On success the evidence document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of
            the province answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # Use the last province answer that carries a non-empty slot filler.
        province = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                province = line_output
        if province is None:
            return current_output

        # Infer the country from the province.  The name is first run
        # through jianfan.ftoj (presumably traditional -> simplified Chinese;
        # confirm).
        state_slot_filler = jianfan.ftoj(province.slot_filler)
        country_name = ''
        for candidate in self.world_coutry_province:
            if state_slot_filler in self.world_coutry_province[candidate]:
                country_name = candidate
                break

        # If the inference fails, return the original answer.
        if not country_name:
            return current_output

        # Search for a document to serve as provenance.
        wide_text = country_name + state_slot_filler
        found_doc_path = search(wide_text,
                                self.sf_object.lucene_searcher,
                                self.sf_object.lucene_analyzer, 50)
        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        doc_id = evidence_doc_path.split('/')[-1].strip()
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()

        wp_beg = doc.find(wide_text)
        if wp_beg < 0:
            # The hit does not contain the literal text, so there are no
            # valid offsets to report; emitting -1 based offsets would
            # produce a bogus provenance.
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        # Offsets are inclusive.  The country is the prefix of wide_text,
        # so the slot filler starts where the wide provenance starts.
        wp_end = wp_beg + len(wide_text) - 1
        sp_beg = wp_beg
        sp_end = sp_beg + len(country_name) - 1

        answer = LineOutput()
        answer.slot_type = slot_type
        answer.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = wide_text
        answer.wide_provenance = [wide_prov]
        # The province answer is the evidence: flag its first provenance as
        # an inference step (this mutates the shared province LineOutput)
        # and carry all of its provenances over.
        province.wide_provenance[0].inference = True
        answer.wide_provenance += province.wide_provenance

        answer.slot_filler = country_name

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = country_name
        answer.slot_filler_prov = [filler_prov]

        answer.confidence_score = 1

        # When the province is Taiwan, Taiwan itself is also reported as a
        # country answer: the province LineOutput is appended unchanged.
        if u'台湾' in state_slot_filler:
            return current_output + [answer, province]

        return current_output + [answer]
Пример #4
0
    def stateorprovince(self, slot_type, evidence_slot_type):
        """Infer a province answer from the query's existing city answer.

        Args:
            slot_type: key of the province slot in
                ``self.query_answer.output``; the new answer is appended to
                the answers already stored under it.
            evidence_slot_type: key of the city slot whose answers serve as
                evidence.

        Returns:
            The current answers for ``slot_type`` plus one new ``LineOutput``
            for the inferred province.  The current answers are returned
            unchanged when no city answer has a slot filler, when no entry
            of ``self.china_province_city`` contains the city, when the
            search finds nothing, or when the top search hit does not
            contain the literal text ``province + city``.

        Side effects:
            On success the evidence document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of
            the city answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # Use the last city answer that carries a non-empty slot filler.
        city = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                city = line_output
        if city is None:
            return current_output

        # Normalise the city name before the lookup: run it through
        # jianfan.ftoj (presumably traditional -> simplified Chinese; confirm)
        # and strip the administrative suffix characters.
        city_slot_filler = jianfan.ftoj(city.slot_filler)
        for suffix in [u'区', u'县', u'市']:
            city_slot_filler = city_slot_filler.replace(suffix, '')

        # Infer the province from the city.  Entries with type 0 list their
        # cities directly under 'sub'; all others have one extra nesting
        # level ('sub' of each item in 'sub').
        province = ''
        for entry in self.china_province_city:
            if entry['type'] == 0:
                found = any(item['name'] == city_slot_filler
                            for item in entry['sub'])
            else:
                found = any(item['name'] == city_slot_filler
                            for group in entry['sub']
                            for item in group['sub'])
            if found:
                province = entry['name']
                break

        # If the inference fails, return the original answer.
        if not province:
            return current_output

        # Search for a document to serve as provenance.
        wide_text = province + city_slot_filler
        found_doc_path = search(wide_text, self.searcher, self.analyzer, 50)
        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        doc_id = evidence_doc_path.split('/')[-1].strip()
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()

        wp_beg = doc.find(wide_text)
        if wp_beg < 0:
            # The hit does not contain the literal text, so there are no
            # valid offsets to report; emitting -1 based offsets would
            # produce a bogus provenance.
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        # Offsets are inclusive.  The province is the prefix of wide_text,
        # so the slot filler starts where the wide provenance starts.
        wp_end = wp_beg + len(wide_text) - 1
        sp_beg = wp_beg
        sp_end = sp_beg + len(province) - 1

        answer = LineOutput()
        answer.slot_type = slot_type
        answer.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = wide_text
        answer.wide_provenance = [wide_prov]
        # The city answer is the evidence: flag its first provenance as an
        # inference step (this mutates the shared city LineOutput) and
        # carry all of its provenances over.
        city.wide_provenance[0].inference = True
        answer.wide_provenance += city.wide_provenance

        answer.slot_filler = province

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = province
        answer.slot_filler_prov = [filler_prov]

        answer.confidence_score = 1

        return current_output + [answer]
Пример #5
0
    def country(self, slot_type, evidence_slot_type):
        """Infer a country answer from the query's existing province answer.

        Args:
            slot_type: key of the country slot in
                ``self.query_answer.output``; the new answer is appended to
                the answers already stored under it.
            evidence_slot_type: key of the province slot whose answers serve
                as evidence.

        Returns:
            The current answers for ``slot_type`` plus one new ``LineOutput``
            for the inferred country.  When the province contains the text
            for Taiwan, the province ``LineOutput`` itself is appended as
            well.  The current answers are returned unchanged when no
            province answer has a slot filler, when no country in
            ``self.world_coutry_province`` lists the province, when the
            search finds nothing, or when the top search hit does not
            contain the literal text ``country + province``.

        Side effects:
            On success the evidence document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of
            the province answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # Use the last province answer that carries a non-empty slot filler.
        province = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                province = line_output
        if province is None:
            return current_output

        # Infer the country from the province.  The name is first run
        # through jianfan.ftoj (presumably traditional -> simplified Chinese;
        # confirm).
        state_slot_filler = jianfan.ftoj(province.slot_filler)
        country_name = ''
        for candidate in self.world_coutry_province:
            if state_slot_filler in self.world_coutry_province[candidate]:
                country_name = candidate
                break

        # If the inference fails, return the original answer.
        if not country_name:
            return current_output

        # Search for a document to serve as provenance.
        wide_text = country_name + state_slot_filler
        found_doc_path = search(wide_text,
                                self.sf_object.lucene_searcher,
                                self.sf_object.lucene_analyzer, 50)
        if not found_doc_path:
            return current_output

        evidence_doc_path = found_doc_path[0]
        doc_id = evidence_doc_path.split('/')[-1].strip()
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()

        wp_beg = doc.find(wide_text)
        if wp_beg < 0:
            # The hit does not contain the literal text, so there are no
            # valid offsets to report; emitting -1 based offsets would
            # produce a bogus provenance.
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        # Offsets are inclusive.  The country is the prefix of wide_text,
        # so the slot filler starts where the wide provenance starts.
        wp_end = wp_beg + len(wide_text) - 1
        sp_beg = wp_beg
        sp_end = sp_beg + len(country_name) - 1

        answer = LineOutput()
        answer.slot_type = slot_type
        answer.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = wide_text
        answer.wide_provenance = [wide_prov]
        # The province answer is the evidence: flag its first provenance as
        # an inference step (this mutates the shared province LineOutput)
        # and carry all of its provenances over.
        province.wide_provenance[0].inference = True
        answer.wide_provenance += province.wide_provenance

        answer.slot_filler = country_name

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = country_name
        answer.slot_filler_prov = [filler_prov]

        answer.confidence_score = 1

        # When the province is Taiwan, Taiwan itself is also reported as a
        # country answer: the province LineOutput is appended unchanged.
        if u'台湾' in state_slot_filler:
            return current_output + [answer, province]

        return current_output + [answer]