def get_match_result(patterns, pattern2attributes, tokens):
    '''
    Build an attribute -> value-text mapping for one sentence.

    If the sentence contains a ';' token it is first cut into chips at that
    token and every chip is matched separately; otherwise the whole token
    list is matched at once.

    patterns: the final patterns chosen for this sentence.
    pattern2attributes: mapping from a pattern to its attribute name.
    tokens: token objects of the sentence; each exposes a ``word`` attribute.

    Returns a dict mapping attribute name to the extracted value string.
    '''
    punctuation = ('.', ';', ',')

    def render(span):
        # Join the words with single spaces, attach punctuation directly to
        # the preceding word, then drop one trailing punctuation mark.
        text = ""
        for tok in span:
            if tok.word in punctuation:
                text = text.strip() + tok.word + " "
            else:
                text += tok.word + " "
        text = text.strip()
        if text and text[-1] in punctuation:
            text = text[:-1]
        return text

    def collect(source, spans):
        # Walk the patterns from last to first, so when several patterns map
        # to the same attribute the earliest pattern is written last.
        for idx in reversed(range(len(patterns))):
            stop = spans[idx][1]
            words = source[spans[idx][0]:stop]
            value = render(words)
            result[pattern2attributes[patterns[idx]]] = value

    result = {}
    separator = Token(';', ':')

    if not contain_token(separator, tokens):
        spans = get_value_pos(patterns, tokens)
        logger.info(str(patterns) + ' value_pos: ' + str(spans))
        collect(tokens, spans)
        return result

    # NOTE(review): every chip writes into the same dict, so a later chip
    # overwrites the value an earlier chip stored for the same attribute --
    # confirm this is intended.
    for chip in cut_token_list(tokens, [separator]):
        spans = get_value_pos(patterns, chip)
        logger.info(str(patterns) + 'value_pos' + str(spans))
        collect(chip, spans)
    return result
def get_fix(size, attribute_value_tokens, definition_tokens):
    '''
    Collect the results of get_fix_chunk over every ';'-separated chunk.

    attribute_value_tokens is cut at ';' tokens; each resulting chunk is
    passed to get_fix_chunk together with size and definition_tokens, and
    the items of all returned iterables are concatenated, in chunk order,
    into one flat list.
    '''
    separator = Token(';', ':')
    pieces = cut_token_list(attribute_value_tokens, [separator])
    return [
        item
        for piece in pieces
        for item in get_fix_chunk(size, piece, definition_tokens)
    ]
def get_tokens(pos_words):
    '''
    Wrap each entry of pos_words in a Token.

    pos_words: iterable of indexable entries; element 0 and element 1 of
    each entry are passed, in that order, to the Token constructor.

    Returns a list of Token objects in input order.
    '''
    return [Token(entry[0], entry[1]) for entry in pos_words]
def _elapsed_microseconds(start, end):
    '''Return the full duration between two datetimes in whole microseconds.'''
    delta = end - start
    # timedelta.microseconds alone is only the sub-second remainder; the days
    # and seconds components must be added to get the real elapsed time.
    return (delta.days * 86400 + delta.seconds) * 1000000 + delta.microseconds


def process_definition(definition, pattern2attrubute):
    '''
    Extract attribute values from one definition text.

    The definition is tokenized and POS-tagged with nltk, converted to
    tokens, and cut into sentence token lists at '.' and ';' tokens. For
    each sentence, candidate patterns are searched among the keys of
    pattern2attrubute, the final patterns are chosen, and get_match_result
    yields the attribute values for that sentence.

    definition: the definition string.
    pattern2attrubute: mapping from pattern to attribute name.

    Returns a dict mapping attribute to value string; values found for the
    same attribute in several sentences are joined with '; '.

    Side effects: adds the tagging time to the global tag_time_all and the
    candidate-search time to the global find_candidate_time (both in
    microseconds), and logs progress and timings through logger.
    '''
    global tag_time_all, find_candidate_time

    attributes2value = {}
    logger.info('definition: %s' % definition)
    stripped = definition.strip()
    if stripped.startswith('See') or stripped.startswith('see'):
        # NOTE(review): the return value is unused and processing continues
        # below -- confirm whether an early return was intended here.
        process_vacant_definition(definition)

    start = datetime.datetime.now()
    text = nltk.word_tokenize(definition)
    def_pos = nltk.pos_tag(text)
    def_tokens = pos_word2tokens(def_pos)
    logger.info(def_pos)
    end = datetime.datetime.now()
    tagging_time = _elapsed_microseconds(start, end)
    tag_time_all += tagging_time
    logger.info('tagging time:%d ' % tagging_time)

    seg_point_token = [Token('.', '.'), Token(';', ':')]
    sents_tokens = cut_token_list(def_tokens, seg_point_token)

    # Per-stage accumulators, in microseconds.
    time_find_candidate_pattern = 0
    time_choice_final_pattern = 0
    time_get_match_result = 0

    for sent_tokens in sents_tokens:
        start = datetime.datetime.now()
        # A fresh key list per call, as on Python 2 where keys() is a list.
        candidate_patterns = find_candidate_pattern(
            list(pattern2attrubute.keys()), sent_tokens)
        end = datetime.datetime.now()
        elapsed = _elapsed_microseconds(start, end)
        time_find_candidate_pattern += elapsed
        logger.info('find candidate pattern time: ' + str(elapsed))
        logger.info("candidate_patterns: " + str(candidate_patterns))
        if len(candidate_patterns) == 0:
            continue

        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns,
                                                sent_tokens)
        end = datetime.datetime.now()
        elapsed = _elapsed_microseconds(start, end)
        time_choice_final_pattern += elapsed
        logger.info('choice final pattern time: ' + str(elapsed))
        logger.info("choiced_patterns: " + str(choiced_patterns))

        # The timed region covers both the matching and the merge below.
        start = datetime.datetime.now()
        attributes2value_part = get_match_result(choiced_patterns,
                                                 pattern2attrubute,
                                                 sent_tokens)
        for attribute, value in attributes2value_part.items():
            if attribute in attributes2value:
                attributes2value[attribute] = (
                    attributes2value[attribute] + '; ' + value)
            else:
                attributes2value[attribute] = value
        end = datetime.datetime.now()
        elapsed = _elapsed_microseconds(start, end)
        time_get_match_result += elapsed
        logger.info('get match result time: ' + str(elapsed))
        logger.info("attributes2value: " + str(attributes2value))

    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: ' +
                str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: ' +
                str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
def pos_word2tokens(pos_words):
    '''
    Convert a sequence of indexable entries into a list of Token objects.

    pos_words: iterable whose entries are indexed at 0 and 1; the two
    elements are passed, in that order, to the Token constructor.

    Returns the Token objects as a list, preserving input order.
    '''
    return [Token(pair[0], pair[1]) for pair in pos_words]