# -*- coding: utf-8 -*-
import codecs
import datetime
import json
import logging
import os
import re
import traceback
from collections import Counter

import chardet
import nltk

# Helpers such as tagfromstring, POSfromstring, cut_list, cut_tuple_list,
# KMP_match, find_candidate_pattern, get_pos2patterns, get_pattern_range,
# get_prior_by_range, get_prior_by_priority, get_value_pos and
# extractor_multi_thread come from companion modules of this project and
# are not defined in this file.

# logger is assumed to be configured by the project's logging setup.
logger = logging.getLogger(__name__)

tag_time_all = 0
find_candidate_time = 0


def test(fun):
    """Run `fun` over every sentence chunk and print a frequency report."""
    result = []
    for item in data_modified:
        pos2defs = item["pos2definition"]
        for pos2def in pos2defs:
            def_tagged_str = pos2def["def_tagged"]
            def_tagged = POSfromstring(def_tagged_str)
            chunks = cut_tuple_list(def_tagged, [('.', '.')])
            for chunk in chunks:
                result.extend(fun(chunk))
    sorted_re = sorted(Counter(result).iteritems(),
                       key=lambda pair: pair[1], reverse=True)
    cnt_1 = 0
    cnt_2 = 0
    cnt_3 = 0
    for item in sorted_re:
        logger.info(item)
        print item
        if item[1] > 1:
            cnt_1 += 1
        if item[1] > 2:
            cnt_2 += 1
        if item[1] > 3:
            cnt_3 += 1
    print len(sorted_re), cnt_1, cnt_2, cnt_3
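
# cut_tuple_list is provided by a companion module; the sketch below shows
# its assumed contract -- split a list of (word, tag) tuples into chunks at
# any of the given separator tuples.  The name suffix and the choice to keep
# the separator with its chunk are assumptions, not the project's code.
def cut_tuple_list_sketch(tagged, seg_points):
    chunks = []
    current = []
    for word_tag in tagged:
        current.append(word_tag)
        if word_tag in seg_points:
            chunks.append(current)
            current = []
    if current:
        chunks.append(current)
    return chunks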
def extract_single_item(data, i, new_data, pattern2attrubute, stanford_tagger):
    """Worker that extracts attributes for the i-th item into new_data."""
    print i, 'start'
    pos2definition = data[i]["pos2definition"]
    for pos2def in pos2definition:
        definition = pos2def["definition"]
        # strip parenthesised asides before tagging
        definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
        # process_definition (defined below) expects tagged input, so tag
        # the cleaned text first; the original call passed the raw string,
        # which does not match that signature.
        def_tagged = stanford_tagger.tag(definition_pure.split())
        attributes2value = process_definition(pattern2attrubute, def_tagged)
        pos2def["attributes"] = attributes2value
        logger.info("\n\n")
    print i, ' over'
    new_data.append(data[i])
def get_start_pos(pattern, sent_pos):
    """Locate where a pattern's value starts in the tagged sentence."""
    pattern_start = KMP_match(pattern, sent_pos)
    # a pattern looks like "word+word$N": words joined by '+', then '$'
    # and an offset N that shifts the value start
    pattern_words = pattern[:pattern.index("$")].split('+')
    cur = int(re.findall(r'\$(\d+)', pattern)[0])
    if cur > 0:
        logger.info("start move right %d" % cur)
    start_tmp = pattern_start + len(pattern_words) - cur
    # skip a leading comma so the value does not start with punctuation
    if sent_pos[start_tmp][0] in [',']:
        start = start_tmp + 1
    else:
        start = start_tmp
    return start
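
# KMP_match is implemented in a companion module; this naive scan is a
# sketch of its assumed contract -- return the index in sent_pos where the
# pattern's word sequence begins, or -1 if it never occurs.  The real
# version is presumably a Knuth-Morris-Pratt search for speed.
def naive_pattern_match(pattern, sent_pos):
    words = pattern[:pattern.index("$")].split('+')
    sent_words = [word_tag[0] for word_tag in sent_pos]
    for i in range(len(sent_words) - len(words) + 1):
        if sent_words[i:i + len(words)] == words:
            return i
    return -1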
def get_end_pos(end_current, sent_pos):
    """Trim trailing function words so the value ends on a content word."""
    cur = 0
    for i in range(end_current - 1, 0, -1):
        if sent_pos[i][1] not in ['DT', 'CC', 'TO', 'WDT', 'IN', 'RB'] \
                and sent_pos[i][0] not in ['be', 'is', 'are', 'that', 'may',
                                           'can', 'performed', ',']:
            break
        else:
            cur += 1
    if cur > 0:
        logger.info("end move left %d" % cur)
    return end_current - cur
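
# Worked example for the trimming above (tokens are made up): with
#     sent_pos = [('density', 'NN'), ('of', 'IN'), ('the', 'DT')]
# get_end_pos(3, sent_pos) walks left past 'the' (DT) and 'of' (IN) and
# returns 1, so the value span ends right after 'density'.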
def choice_final_pattern(patterns, sent_pos):
    '''
    Two ranking principles:
    if two patterns match at the same position, the one covering the larger
    range has higher priority; if two patterns share a common part or their
    matches overlap in a sentence, the one that occurs earlier has higher
    priority.
    '''
    pos2patterns = get_pos2patterns(patterns, sent_pos)
    sort_pos2patterns = sorted(pos2patterns.iteritems(),
                               key=lambda d: d[0], reverse=False)
    logger.info('sorted_pattern: ' + str(sort_pos2patterns))
    pattern2range = get_pattern_range(patterns, sent_pos)
    logger.info('pattern2range: ' + str(pattern2range))
    patterns_sort_by_range = get_prior_by_range(sort_pos2patterns,
                                                pattern2range)
    logger.info('patterns_range: ' + str(patterns_sort_by_range))
    patterns_final = get_prior_by_priority(patterns_sort_by_range,
                                           pattern2range)
    logger.info('patterns_final: ' + str(patterns_final))
    return patterns_final
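
# get_pos2patterns is defined in a companion module; the sketch below shows
# its assumed behaviour -- group candidate patterns by the position at which
# they match, so the ranking above can compare patterns that start at the
# same place.  The grouping shape is an assumption; the real helper may
# carry extra range information.
def get_pos2patterns_sketch(patterns, sent_pos):
    pos2patterns = {}
    for pattern in patterns:
        start = naive_pattern_match(pattern, sent_pos)
        if start >= 0:
            pos2patterns.setdefault(start, []).append(pattern)
    return pos2patterns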
def get_match_result(patterns, pattern2attributes, sent_pos):
    '''
    Use the final patterns to match the sentence.  If ';' occurs in the
    sentence we cut it into chips first; otherwise we rely directly on the
    pattern and value positions to extract the attributes and values.
    patterns: the final patterns to apply to this sentence
    '''
    attributes2value = {}

    def collect(tagged, value_pos):
        # walk the patterns from last to first and rebuild each value span
        # as a string, gluing punctuation onto the preceding word
        for i in range(len(patterns) - 1, -1, -1):
            sent = ""
            end = value_pos[i][1]
            for word_tag in tagged[value_pos[i][0]:end]:
                if word_tag[0] in ['.', ';', ',']:
                    sent = sent.strip() + word_tag[0] + " "
                else:
                    sent += word_tag[0] + " "
            value = sent.strip()
            # drop a dangling trailing punctuation mark
            if len(value) > 0 and value[-1] in [';', ',', '.']:
                value = value[:-1]
            attributes2value[pattern2attributes[patterns[i]]] = value

    if (';', ':') in sent_pos and not (sent_pos[0][0] == 'See'
                                       and sent_pos[1][0] == 'also'):
        chips = cut_list(sent_pos, [(';', ':')])
        for chip in chips:
            value_pos = get_value_pos(patterns, chip)
            logger.info(str(patterns) + ' value_pos: ' + str(value_pos))
            collect(chip, value_pos)
    else:
        value_pos = get_value_pos(patterns, sent_pos)
        logger.info(str(patterns) + ' value_pos: ' + str(value_pos))
        collect(sent_pos, value_pos)
    return attributes2value
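
# get_value_pos is defined in a companion module; the sketch below shows
# one plausible reading of its contract -- for each final pattern, compute
# the (start, end) span of its value, composing get_start_pos and
# get_end_pos above.  The "value runs until the next pattern's match or the
# end of the sentence" rule is an assumption.
def get_value_pos_sketch(patterns, sent_pos):
    value_pos = []
    for i, pattern in enumerate(patterns):
        start = get_start_pos(pattern, sent_pos)
        if i + 1 < len(patterns):
            nxt = naive_pattern_match(patterns[i + 1], sent_pos)
            end = nxt if nxt >= 0 else len(sent_pos)
        else:
            end = len(sent_pos)
        value_pos.append((start, get_end_pos(end, sent_pos)))
    return value_pos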
def pre_process():
    cnt_exp = 0
    for item in data_tagged_modified:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            try:
                definition = pos2def["definition"]
                definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
                tokens = nltk.word_tokenize(definition_pure.encode('utf-8'))
                for token in tokens:
                    # log tokens that are not plain ASCII, together with
                    # their detected encoding and a GBK rendering
                    if chardet.detect(token)['encoding'] != 'ascii':
                        logger.info("%s\t%s\t%s" % (
                            token,
                            chardet.detect(token)['encoding'],
                            token.decode("utf-8").encode("gbk")))
            except Exception:
                cnt_exp += 1
                traceback.print_exc()
                print definition
    print cnt_exp
    path_tagged_output = "items_tagged_modified_pre.json"
    json.dump(data_tagged_modified,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
def extract_items_single_thread(data, pattern2attrubute):
    data_new = []
    all_time = 0
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            def_tagged = tagfromstring(pos2def["def_tagged"])
            start = datetime.datetime.now()
            attributes2value = process_definition(pattern2attrubute,
                                                  def_tagged)
            end = datetime.datetime.now()
            # accumulate elapsed time in microseconds (the original mixed
            # milliseconds and microseconds here)
            all_time += ((end - start).seconds * 1000000
                         + (end - start).microseconds)
            logger.info('process_definition time: %ds %dus' %
                        ((end - start).seconds,
                         (end - start).microseconds))
            pos2def["attributes"] = attributes2value
            logger.info("\n\n")
        data_new.append(item)
    global tag_time_all
    logger.info("tag all time is: %d" % tag_time_all)
    global find_candidate_time
    logger.info("find candidate time is: %d" % find_candidate_time)
    logger.info("all time is: %d" % all_time)
    return data_new
def IE_multi_thread():
    path_data = "data" + os.sep + "items_tagged_modified.json"
    path_pattern = "patterns.json"
    path_tagged_output = "items_tagged_auto.json"
    pattern2attrubute = json.load(codecs.open(path_pattern,
                                              encoding='UTF-8'))
    logger.info("loaded all the patterns")
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    logger.info("loaded all the data")
    data_new = extractor_multi_thread(data, pattern2attrubute)
    logger.info("has extracted all the attributes")
    json.dump(data_new,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    logger.info("output over")
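
# extractor_multi_thread lives in a companion module; below is a minimal
# sketch of what it is assumed to do -- fan the per-definition work out over
# a fixed number of threads.  The thread count, the strided index
# partitioning and the shared-list-plus-lock design are assumptions.
import threading


def extractor_multi_thread_sketch(data, pattern2attrubute, n_threads=4):
    new_data = []
    lock = threading.Lock()

    def worker(indices):
        for i in indices:
            item = data[i]
            for pos2def in item["pos2definition"]:
                def_tagged = tagfromstring(pos2def["def_tagged"])
                pos2def["attributes"] = process_definition(pattern2attrubute,
                                                           def_tagged)
            with lock:
                new_data.append(item)

    threads = [threading.Thread(target=worker,
                                args=(range(t, len(data), n_threads),))
               for t in range(n_threads)]
    for th in threads:
        th.start()
    for th in threads:
        th.join()
    return new_data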
def IE_auto_pattern():
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = (path_project + os.sep + "input" + os.sep
                 + "items_tagged_modified.json")
    path_pattern = (path_project + os.sep + "output" + os.sep
                    + "Patterns_auto.json")
    path_tagged_output = (path_project + os.sep + "output" + os.sep
                          + "items_tagged_auto.json")
    pattern2attrubute = json.load(codecs.open(path_pattern,
                                              encoding='UTF-8'))
    logger.info("loaded all the patterns")
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    logger.info("loaded all the data")
    data_new = extract_items_single_thread(data, pattern2attrubute)
    logger.info("has extracted all the attributes")
    json.dump(data_new,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    logger.info("output over")
# NOTE: the original header and first branch of this reader were truncated
# in the source; the head below is a minimal reconstruction around the
# surviving elif/else branches, with '~~~~~~~' kept as the item separator.
def read_tagged_items(lines):
    items = []
    itemTmp = []
    defTmp = []
    for i in range(len(lines)):
        if lines[i].strip() == '~~~~~~~':
            items.append(itemTmp)
            itemTmp = []
        else:
            sent = tagfromstring(lines[i])
            defTmp.append(sent)
    return items


path_data_tagged_modified = "data" + os.sep + "items_tagged_modified_POS.json"
path_data_tagged_modified_extract = ("out" + os.sep
                                     + "items_tagged_modified_extract.json")
data_tagged_modified = json.load(
    codecs.open(path_data_tagged_modified, encoding='UTF-8'))
logger.info("loaded all the data")

path_data_modified = "data" + os.sep + "items_modified_POS.json"
# this reassignment overwrites the extract path set above, as in the original
path_data_tagged_modified_extract = ("out" + os.sep
                                     + "items_modified_extract.json")
data_modified = json.load(codecs.open(path_data_modified, encoding='UTF-8'))

path_data_tagged_modified_test = ("data" + os.sep
                                  + "items_tagged_modified_test.json")
path_data_tagged_modified_extract_test = (
    "out" + os.sep + "items_tagged_modified_extract_test.json")
data_tagged_modified_test = json.load(
    codecs.open(path_data_tagged_modified_test, encoding='UTF-8'))
logger.info("loaded all the test data")
def process_definition(pattern2attrubute, def_tagged):
    attributes2value = {}
    definition = ' '.join([x[0] for x in def_tagged])
    logger.info('definition: %s' % definition)
    # cross-reference definitions ("See ...") carry no attributes;
    # return an empty dict rather than None so callers can store it
    if definition.strip().startswith('See') \
            or definition.strip().startswith('see'):
        return attributes2value
    # the definition arrives pre-tagged, so no tagging is needed here
    logger.info(def_tagged)
    # first cut on sentence boundaries, then on ';', except that
    # "See also ..." sentences are kept whole
    seg_point = [('.', '.')]
    sents_pos_period = cut_list(def_tagged, seg_point)
    sents_pos = []
    for sent_pos_period in sents_pos_period:
        if sent_pos_period[0][0] == 'See' and sent_pos_period[1][0] == 'also':
            sents_pos.append(sent_pos_period)
        else:
            sents_pos.extend(cut_list(sent_pos_period, [(';', ':')]))
    time_find_candidate_pattern = 0
    time_choice_final_pattern = 0
    time_get_match_result = 0
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        start = datetime.datetime.now()
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        end = datetime.datetime.now()
        time_find_candidate_pattern += (end - start).microseconds
        logger.info('find candidate pattern time: '
                    + str((end - start).microseconds))
        logger.info("candidate_patterns: " + str(candidate_patterns))
        if len(candidate_patterns) == 0:
            continue
        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        end = datetime.datetime.now()
        time_choice_final_pattern += (end - start).microseconds
        logger.info('choice final pattern time: '
                    + str((end - start).microseconds))
        logger.info("choiced_patterns: " + str(choiced_patterns))
        start = datetime.datetime.now()
        attributes2value_part = get_match_result(choiced_patterns,
                                                 pattern2attrubute, sent_pos)
        # merge values when an attribute appears in several sentences
        for attribute, value in attributes2value_part.iteritems():
            if attribute in attributes2value:
                attributes2value[attribute] += '; ' + value
            else:
                attributes2value[attribute] = value
        end = datetime.datetime.now()
        time_get_match_result += (end - start).microseconds
        logger.info('get match result time: '
                    + str((end - start).microseconds))
        logger.info("attributes2value: " + str(attributes2value))
    global find_candidate_time
    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: '
                + str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: '
                + str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
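
# Minimal usage sketch, with data shapes taken from the callers above: the
# pattern map comes from patterns.json and maps pattern strings to attribute
# names, and the definition arrives pre-tagged.
#
#   pattern2attrubute = json.load(codecs.open("patterns.json",
#                                             encoding='UTF-8'))
#   def_tagged = tagfromstring(pos2def["def_tagged"])
#   pos2def["attributes"] = process_definition(pattern2attrubute, def_tagged)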
if __name__ == '__main__':
    start = datetime.datetime.now()
    logger.info(start)
    print start
    # IE_auto_pattern()
    # tagged_def()
    # IE()
    pre_process()
    end = datetime.datetime.now()
    logger.info(end)
    print end
    print (end - start).seconds, (end - start).microseconds
    logger.info("cost time: " + str((end - start).seconds))