def stanford_parser(sentences, proc_id=0):
    """Parse a batch of single-sentence files with Stanford CoreNLP.

    `sentences` maps sentence ids to raw text; `proc_id` keeps the temporary
    directories of parallel worker processes apart.
    """
    tmp_dir = 'data/.tmp' + str(proc_id)
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)  # create a temporary dir for parsing large paragraphs
    for sent_id in sentences.keys():
        f = io.open(os.path.join(tmp_dir, sent_id), 'w', -1, 'utf-8')
        f.write(sentences[sent_id])
        f.close()
    # locate the CoreNLP distribution: prefer the CN_SF_PATH environment
    # variable, otherwise fall back to a path relative to this file
    if 'CN_SF_PATH' in os.environ:
        stanford_parser_dir = os.path.join(os.environ['CN_SF_PATH'],
                                           'externals/stanford-corenlp-full-2014-08-27/')
    else:
        stanford_parser_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                           '../../externals/stanford-corenlp-full-2014-08-27/')
    parsing_result = list(batch_parse(tmp_dir,
                                      stanford_parser_dir,
                                      properties=os.path.join(stanford_parser_dir,
                                                              "StanfordCoreNLP-chinese.properties")))
    # each temporary file holds exactly one sentence, so keep the first parse only
    result = dict()
    for r in parsing_result:
        result[r['file_name']] = r['sentences'][0]
    return result
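# Illustrative usage sketch (not executed by the pipeline). The sentence-id
# scheme and the 'words'/'text' fields mirror how the CoreNLP output is consumed
# below; any other field names would be assumptions about the wrapper's output.
#
#     sentences = {u'doc_001|0': u'...', u'doc_001|3': u'...'}
#     parsed = stanford_parser(sentences, proc_id=1)
#     for sent_id, parse in parsed.items():
#         print(sent_id, parse['words'])          # tokens with per-token attributes
#         print(sent_id, ''.join(parse['text']))  # the segmented sentence rejoined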
def evidence_extaction(self):
    # ************* batch segment long article ************* #
    start = time.time()
    if os.path.exists('data/.tmp/'):
        shutil.rmtree('data/.tmp')
    os.makedirs('data/.tmp/')  # create a temporary dir for segmenting large paragraphs
    for doc_id in self.cleaned_docs:
        f = io.open(os.path.join('data/.tmp', doc_id), 'w', -1, 'utf-8')
        f.write(self.cleaned_docs[doc_id])
        f.close()
    # run stanford segmenter
    stanford_nlp_dir = os.path.join(self.CN_SF_PATH, 'externals/stanford-corenlp-full-2014-08-27/')
    segmenter_result = list(batch_parse('data/.tmp/',
                                        stanford_nlp_dir,
                                        properties=os.path.join(stanford_nlp_dir,
                                                                "StanfordCoreNLP-chinese.Segmenter.properties")))
    for r in segmenter_result:
        self.segmented_docs[r['file_name']] = r['sentences']
    print('segmenting time cost ' + str(time.time() - start))

    # cPickle caching, kept for development
    # cPickle.dump(self.segmented_docs, open('data/segmented_docs.pkl', 'wb'))
    # self.segmented_docs = cPickle.load(open('data/segmented_docs.pkl', 'rb'))

    # ************* select evidence ************* #
    sent_to_parse = dict()
    self.evidence = OrderedDict()
    for query in self.queries:
        print('\textracting ' + query.name)
        evidences = OrderedDict()  # {slot_type: [Evidence, ...]}
        for doc_id in self.query_docs[query.id].keys():
            seg_result = self.segmented_docs[doc_id]
            for i in xrange(len(seg_result)):
                # sentence is in Stanford CoreNLP's standard output format
                sentence = seg_result[i]
                sent_id = '|'.join([doc_id, str(i)])
                # a sentence that is too long or too short carries little dependency information
                if len(sentence['words']) > 130 or len(sentence['words']) < 3:
                    continue
                sent_text = ''.join(sentence['text'])

                # *************** check if this sentence is an evidence ******************** #
                # ============== common case ============== #
                seg_sent_text = sentence['text']  # list of tokens
                seg_sent_text = [jianfan.ftoj(w) for w in seg_sent_text]  # normalize traditional characters to simplified
                # joining the token list here overcomes segmentation errors when matching the query name
                if query.name not in ''.join(seg_sent_text):
                    continue
                triggers = self.triggers[query.entity_type]
                if query.entity_type == 'PER':
                    slot_types = self.PER_SLOT_TYPE
                elif query.entity_type == 'ORG':
                    slot_types = self.ORG_SLOT_TYPE
                for slot_type in slot_types:
                    if slot_type not in evidences.keys():
                        evidences[slot_type] = []
                    for t in triggers[slot_type]:
                        # compare triggers against segmented tokens; may be affected by segmentation errors
                        if t not in seg_sent_text:
                            continue
                        evidences[slot_type].append(Evidence(doc_id, query.id, t, sent_text, sent_id))
                        sent_to_parse[sent_id] = sent_text  # queue the sentence for parallel parsing later
                # ============== special case ============== #
                if query.entity_type == 'PER':
                    evidences['per:alternate_names'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                if query.entity_type == 'ORG':
                    # for org:alternate_names, every sentence containing the query is evidence for pattern matching
                    evidences['org:alternate_names'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                    # likewise for the org:XXX_of_headquarters slots
                    evidences['org:country_of_headquarters'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                    evidences['org:stateorprovince_of_headquarters'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                    evidences['org:city_of_headquarters'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
        self.evidence[query.id] = evidences

    # *************** parallel parsing ****************** #
    def chunkIt(seq, num):
        # split seq into num consecutive, nearly equal chunks
        avg = len(seq) / float(num)
        out = []
        last = 0.0
        while last < len(seq):
            out.append(seq[int(last):int(last + avg)])
            last += avg
        return out

    # run the stanford parser workers with multiprocessing
    process_num = multiprocessing.cpu_count() / 2 if multiprocessing.cpu_count() / 2 < 10 else 10
    p = multiprocessing.Pool(processes=process_num)
    chunked_sent = [dict(item) for item in chunkIt(sent_to_parse.items(), process_num)]
    mp_result = [p.apply_async(stanford_parser, args=(chunked_sent[i], str(i))) for i in range(process_num)]
    mp_result = [r.get() for r in mp_result]
    sent_parsing_result = {}
    for r in mp_result:
        sent_parsing_result.update(r)

    # cPickle caching, kept for development
    # cPickle.dump(sent_parsing_result, open('data/sent_parsing_result.pkl', 'wb'))
    # sent_parsing_result = cPickle.load(open('data/sent_parsing_result.pkl', 'rb'))

    # attach parse results to the collected evidences
    for q_id in self.evidence.keys():
        evidences = self.evidence[q_id]
        for slot_type in evidences.keys():
            for e in evidences[slot_type]:
                if not e.trigger:
                    continue
                e.parse_result = sent_parsing_result[e.sent_id]

    # *************** correct segmenter errors ******************** #
    china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
    province_city_list = []
    for p in china_province_city:
        province_city_list += [p['name']]
        for c in p['sub']:
            province_city_list += [c['name']]
            if p['type'] == 0:
                continue
            for d in c['sub']:
                province_city_list += [d['name']]
    for q_id in self.evidence.keys():
        for slot_type in self.evidence[q_id]:
            for i in xrange(len(self.evidence[q_id][slot_type])):
                self.evidence[q_id][slot_type][i] = self.correct_evidence(
                    self.find_query(q_id).name, self.evidence[q_id][slot_type][i])
                for p_or_c in province_city_list:
                    if len(p_or_c) > 2 and \
                            p_or_c in ''.join(self.evidence[q_id][slot_type][i].parse_result['text']):
                        self.evidence[q_id][slot_type][i] = \
                            self.correct_evidence(p_or_c, self.evidence[q_id][slot_type][i])
    print('Done')
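# For reference (illustrative, values follow from the helper above): chunkIt
# splits a sequence into `num` consecutive, nearly equal slices, e.g.
# chunkIt(range(10), 3) -> [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]. Each slice of
# (sent_id, text) pairs is rebuilt into a dict, parsed by one stanford_parser
# worker, and the per-worker results are merged into sent_parsing_result so
# every triggered Evidence can look up its parse by sent_id.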