Example #1
# assumed imports (module-level in the original project); batch_parse is assumed
# to come from the corenlp-python wrapper bundled under externals/
import io
import os
import shutil

from corenlp import batch_parse

def stanford_parser(sentences, proc_id='0'):
    # print('I am stanford parsing process ' + proc_id)
    tmp_dir = 'data/.tmp' + proc_id  # proc_id is a string so it can be appended to the path
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)  # create a temporary dir for parsing large paragraphs

    # write each sentence to its own file so CoreNLP can batch-process the directory
    for sent_id in sentences.keys():
        f = io.open(os.path.join(tmp_dir, sent_id), 'w', -1, 'utf-8')
        f.write(sentences[sent_id])
        f.close()
    parsing_result = list(batch_parse(tmp_dir,
                                      os.path.join(os.environ['CN_SF_PATH'],
                                                   'externals/stanford-corenlp-full-2014-08-27/'),
                                      properties="StanfordCoreNLP-chinese.properties",
                                      ))
    result = dict()
    for r in parsing_result:
        result[r['file_name']] = r['sentences'][0]

    return result
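
A minimal usage sketch, not taken from the original project: it assumes the module-level setup above, a CoreNLP distribution reachable through CN_SF_PATH, and a hypothetical sentences dict whose keys double as temp file names; the worker id is passed as a string because it is appended to the temp-directory path.

# hypothetical driver for the helper above
sentences = {
    'doc_001|0': u'这是第一个句子。',
    'doc_001|1': u'这是第二个句子。',
}
parsed = stanford_parser(sentences, proc_id='0')
for sent_id in parsed:
    print(sent_id + ': ' + ''.join(parsed[sent_id]['text']))  # 'text' is the token list in the parse output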
Example #2
def stanford_parser(sentences, proc_id='0'):
    # print('I am stanford parsing process ' + proc_id)
    tmp_dir = 'data/.tmp' + proc_id
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)  # create a temporary dir for parsing large paragraphs

    for sent_id in sentences.keys():
        f = io.open(os.path.join(tmp_dir, sent_id), 'w', -1, 'utf-8')
        f.write(sentences[sent_id])
        f.close()
    stanford_parser_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                       '../../externals/stanford-corenlp-full-2014-08-27/')
    parsing_result = list(batch_parse(tmp_dir,
                                      stanford_parser_dir,
                                      properties=os.path.join(stanford_parser_dir, "StanfordCoreNLP-chinese.properties")
                                      ))
    result = dict()
    for r in parsing_result:
        result[r['file_name']] = r['sentences'][0]

    return result
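
Example #2 is identical to Example #1 except for how it locates CoreNLP: instead of reading the CN_SF_PATH environment variable, it resolves externals/stanford-corenlp-full-2014-08-27/ relative to the module's own file, and it passes the Chinese properties file as an absolute path rather than a bare file name.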
Example #3
    def evidence_extaction(self):
        # ************* batch segment long article ************* #
        start = time.time()
        if os.path.exists('data/.tmp/'):
            shutil.rmtree('data/.tmp')
        os.makedirs('data/.tmp/')  # create a temporary dir for parsing large paragraphs
        for doc_id in self.cleaned_docs:
            f = io.open(os.path.join('data/.tmp', doc_id), 'w', -1, 'utf-8')
            f.write(self.cleaned_docs[doc_id])
            f.close()

        # run stanford segmenter
        stanford_nlp_dir = os.path.join(self.CN_SF_PATH,
                                        'externals/stanford-corenlp-full-2014-08-27/')
        segmenter_result = list(batch_parse('data/.tmp/',
                                            stanford_nlp_dir,
                                            properties=os.path.join(stanford_nlp_dir,
                                                                    "StanfordCoreNLP-chinese.Segmenter.properties")
                                            ))
        for r in segmenter_result:
            self.segmented_docs[r['file_name']] = r['sentences']
        print('segmenting time cost '+str(time.time()-start))

        # cpickle for development
        # cPickle.dump(self.segmented_docs, open('data/segmented_docs.pkl', 'wb'))
        # self.segmented_docs = cPickle.load(open('data/segmented_docs.pkl', 'rb'))

        # ************* select evidence ************* #
        sent_to_parse = dict()

        self.evidence = OrderedDict()
        for query in self.queries:
            print('\textracting ' + query.name)

            evidences = OrderedDict()  # {slot_type: [Evidence, ...]}
            for doc_id in self.query_docs[query.id].keys():
                seg_result = self.segmented_docs[doc_id]
                for i in xrange(len(seg_result)):  # each element is a sentence in Stanford CoreNLP's output format
                    sentence = seg_result[i]
                    sent_id = '|'.join([doc_id, str(i)])
                    # if sentence is too long or too short, it carries less dependency information
                    if len(sentence['words']) > 130 or len(sentence['words']) < 3:
                        continue

                    sent_text = ''.join(sentence['text'])

                    # *************** check if this sentence is an evidence ******************** #
                    # ============== common case ============= #
                    seg_sent_text = sentence['text']  # list of tokens
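                    # jianfan.ftoj normalizes traditional characters to simplified so trigger matching is consistent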
                    seg_sent_text = [jianfan.ftoj(w) for w in seg_sent_text]

                    # joining the token list overcomes word-segmentation errors when matching the query name
                    if query.name not in ''.join(seg_sent_text):
                        continue

                    triggers = self.triggers[query.entity_type]

                    if query.entity_type == 'PER':
                        slot_types = self.PER_SLOT_TYPE
                    elif query.entity_type == 'ORG':
                        slot_types = self.ORG_SLOT_TYPE

                    for slot_type in slot_types:
                        if slot_type not in evidences.keys():
                            evidences[slot_type] = []
                        for t in triggers[slot_type]:
                            # compare triggers against segmented words; this might be affected by segmentation errors
                            if t not in seg_sent_text:
                                continue
                            evidences[slot_type].append(Evidence(doc_id, query.id, t, sent_text, sent_id))
                            sent_to_parse[sent_id] = sent_text  # add sentence and do parallel parsing later.

                    # ============== special case ============== #
                    if query.entity_type == 'PER':
                        evidences['per:alternate_names'].append(Evidence(doc_id, query.id, '',
                                                                         sent_text, sent_id, sentence))

                    if query.entity_type == 'ORG':
                        # for org:alternate_names, every sentence that contains the query is kept as evidence for pattern matching
                        evidences['org:alternate_names'].append(Evidence(doc_id, query.id, '',
                                                                         sent_text, sent_id, sentence))

                        # for the org:XXX_of_headquarters slots, every sentence that contains the query is kept as evidence for pattern matching
                        evidences['org:country_of_headquarters'].append((Evidence(doc_id, query.id, '',
                                                                                  sent_text, sent_id, sentence)))
                        evidences['org:stateorprovince_of_headquarters'].append((Evidence(doc_id, query.id, '',
                                                                                          sent_text, sent_id, sentence)))
                        evidences['org:city_of_headquarters'].append((Evidence(doc_id, query.id, '',
                                                                               sent_text, sent_id, sentence)))

            self.evidence[query.id] = evidences

        # *************** parallel parsing ****************** #
        def chunkIt(seq, num):
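            # split seq into num roughly equal-sized chunks,
            # e.g. chunkIt([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4, 5]]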
            avg = len(seq) / float(num)
            out = []
            last = 0.0

            while last < len(seq):
                out.append(seq[int(last):int(last + avg)])
                last += avg

            return out

        # run stanford parser in multiprocessing
        process_num = max(1, min(multiprocessing.cpu_count() // 2, 10))  # at most 10 worker processes, at least 1
        p = multiprocessing.Pool(processes=process_num)
        chunked_sent = [dict(item) for item in chunkIt(sent_to_parse.items(), process_num)]
        mp_result = [p.apply_async(stanford_parser,
                                   args=(chunk, str(i))) for i, chunk in enumerate(chunked_sent)]
        mp_result = [res.get() for res in mp_result]  # block until every worker finishes and collect its result
        p.close()
        p.join()
        sent_parsing_result = {}
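        # merge the per-worker dictionaries into a single {sent_id: parsed sentence} mapping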
        for r in mp_result:
            sent_parsing_result.update(r)

        # cpickle for development
        # cPickle.dump(sent_parsing_result, open('data/sent_parsing_result.pkl', 'wb'))
        # sent_parsing_result = cPickle.load(open('data/sent_parsing_result.pkl', 'rb'))

        # updating evidences
        for q_id in self.evidence.keys():
            evidences = self.evidence[q_id]
            for slot_type in evidences.keys():
                for e in evidences[slot_type]:
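                    # special-case evidences were created with an empty trigger and never queued for
                    # parsing, so they have no entry in sent_parsing_result; skip them here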
                    if not e.trigger:
                        continue
                    e.parse_result = sent_parsing_result[e.sent_id]

        # *************** correct segmenter error ******************** #
        china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
        province_city_list = []
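        # flatten the nested province -> city -> district hierarchy into a flat list of place names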
        for p in china_province_city:
            province_city_list += [p['name']]
            for c in p['sub']:
                province_city_list += [c['name']]
                if p['type'] == 0:
                    continue
                for d in c['sub']:
                    province_city_list += [d['name']]

        for q_id in self.evidence.keys():
            for slot_type in self.evidence[q_id]:
                for i in xrange(len(self.evidence[q_id][slot_type])):
                    self.evidence[q_id][slot_type][i] = self.correct_evidence(self.find_query(q_id).name,
                                                                              self.evidence[q_id][slot_type][i])
                    for p_or_c in province_city_list:
                        if len(p_or_c) > 2 and p_or_c in \
                                ''.join(self.evidence[q_id][slot_type][i].parse_result['text']):
                            self.evidence[q_id][slot_type][i] = \
                                self.correct_evidence(p_or_c, self.evidence[q_id][slot_type][i])

        print('Done')