def find_candidate_pattern(patterns, sent_pos):
    """Return the subset of ``patterns`` that qualify as candidates for the
    POS-tagged sentence ``sent_pos``, logging each accepted pattern."""
    matched = []
    for pat in patterns:
        if not is_candidate_pattern(pat, sent_pos):
            continue
        logger.info(pat + ": is candidate")
        matched.append(pat)
    return matched
def getVBContext(attribute2prefixList):
    """Collect and log the POS contexts around the trailing VB tag of every prefix.

    For each prefix (first element is a '+'-joined POS sequence), locate the VB
    position via indexVB().  Depending on how close the VB sits to the end of
    the sequence (0, 1 or 2 positions trimmed off the tail), extract a window
    of up to three POS tags ending at the VB, then log a frequency table of
    the distinct contexts and the per-prefix tuples.

    This function only logs; it returns None.
    """
    VBContextList = []
    prefixList = []
    for attribute, prefix in attribute2prefixList.iteritems():
        prefixList.extend(prefix)
    prefixListContext = []
    for prefix in prefixList:
        POSList = prefix[0].split('+')
        VBPos = indexVB(POSList)
        if VBPos == -1:
            continue
        # FIX: the original triplicated the extraction logic across three
        # branches differing only by an offset; collapsed into one path.
        # `offset` = how many trailing tokens are trimmed after the VB.
        if VBPos >= len(POSList) - 2:
            offset = 0
        elif VBPos == len(POSList) - 3:
            offset = 1
        elif VBPos == len(POSList) - 4:
            offset = 2
        else:
            # VB sits too far from the end: no context extracted (as before).
            continue
        end = -offset if offset else None
        if len(POSList) <= 3 + offset:
            # Short sequence: keep everything up to the trimmed tail.
            context = '+'.join(POSList[:end])
        else:
            # Long sequence: keep only the 3 tags ending at the VB.
            context = '+'.join(POSList[len(POSList) - 3 - offset:end])
        VBContextList.append(context)
        prefixListContext.append((context, prefix[0], prefix[1], prefix[2]))
    logger.info('\n----------------VBContext num: ------------------')
    c_VBContext = Counter(VBContextList)
    for item in sorted(c_VBContext.iteritems(),
                       key=lambda asd: asd[1],
                       reverse=True):
        logger.info(item[0] + ': ' + str(item[1]))
    logger.info('\n----------------VBContext seq: ------------------')
    for tup in sorted(prefixListContext,
                      key=lambda asd: asd[0],
                      reverse=False):
        logger.info(tup)
def main():
    """Pipeline entry point: load the tagged items, mine pattern candidates,
    filter and merge them, then dump the prioritised patterns to JSON files."""
    # Kept only for the commented-out project-relative paths below.
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = "items_tagged_modified.json"
    # path_data = path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
    # FIX: the project-relative values previously assigned here were dead code,
    # immediately overwritten by the plain file names; only the effective
    # assignments are kept (the old variants remain as comments).
    # path_pattern = path_project + os.sep + "output" + os.sep + "Patterns_auto.json"
    # path_pattern_sorted = path_project + os.sep + "output" + os.sep + "pattern_auto_sorted_by_attribute_pattern.json"
    path_pattern = "Patterns_auto.json"
    path_pattern_sorted = "pattern_auto_sorted_by_attribute_pattern.json"
    # path_pattern_sorted_by_attribute=path_project+os.sep+"output"+os.sep+"pattern_auto_sorted_by_attribute.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    # acquire all the pattern candidates
    attribute2PSFSList = acquire_patterns(data)
    # getVBContext(attribute2PSFSList)
    # filter the pattern candidates
    PASFFLSsList = filterPatterns(attribute2PSFSList)
    # merge the pattern candidates
    patterns = merge_pattern_all(PASFFLSsList)
    patterns_priority = calculate_proprity2pattern(patterns)
    logger.info("has acquired all the patterns")
    json.dump(patterns_priority,
              codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    # json.dump(dict_reverse(patterns), codecs.open(path_pattern_reverse, 'w','utf-8'),ensure_ascii=False,indent=2)
    # dict_sorted_value(dict_reverse(patterns),path_pattern_reverse_sorted)
    sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted)
    logger.info("output over")
def get_final_tokens(pASFFLSsList):
    """Merge, in place, every pair of PASFFLSs entries whose patterns overlap
    head-to-tail; only entries that co-occur in the same sentence may merge.
    Entries intersecting nothing are kept as-is.  Mutates the caller's list."""
    # Merge all patterns whose head/tail may cross, but only patterns that
    # appear in the same sentence can be merged.
    # If an entry intersects none of the others, it is preserved unchanged.
    if len(pASFFLSsList) == 1:
        return pASFFLSsList
    pASFFLSsList_new = []
    visit_ele = set([])
    for i in range(len(pASFFLSsList)):
        # logger.info('i=%d' % i)
        # cmp_result tracks whether element i intersects any later element; if
        # it intersects nothing and was never merged by an earlier element, it
        # is placed into the result on its own.
        cmp_result = []
        for j in range(i + 1, len(pASFFLSsList), 1):
            cmp_ = is_intersectPASFFW(pASFFLSsList[i], pASFFLSsList[j])
            if cmp_ == -1:
                # logger.info((' have no intersect ',PASFFWList[i],PASFFWList[j]))
                continue
            else:
                visit_ele.add(i)
                visit_ele.add(j)
                pASFFLSsList_new.append(cmp_)
                cmp_result.append(cmp_)
        if len(cmp_result) == 0 and (i not in visit_ele):
            pASFFLSsList_new.append(pASFFLSsList[i])
            # logger.info(('append: ',PASFFWList[i]))
        elif len(cmp_result) > 0 and max(cmp_result) == -1 and (
                i not in visit_ele):
            # NOTE(review): cmp_result only ever receives values != -1 above,
            # so max(cmp_result) == -1 looks unreachable — confirm intent.
            pASFFLSsList_new.append(pASFFLSsList[i])
    pASFFLSsList_new_RemoveDup = removeDupPASFFLSs(pASFFLSsList_new)
    logger.info('final merge result:')
    for x in pASFFLSsList_new_RemoveDup:
        logger.info(x)
    # Replace the caller's list contents in place (implicit None return on
    # this path, unlike the early-return above).
    del pASFFLSsList[:]
    pASFFLSsList.extend(pASFFLSsList_new_RemoveDup)
def getChunkPrefix(size, attribute_value_tokens, sentTokens):
    """Enumerate prefix windows in front of the attribute value's match point.

    Starting at the first match of ``attribute_value_tokens`` inside
    ``sentTokens``, slide the anchor right CYCLE times.  For each shift, take
    the window of at most ``size`` tokens preceding the anchor and emit one
    (prefix_tokens, shift, sentTokens) tuple per candidate position set
    returned by get_can_pos_set().  Returns [] when the value never matches.
    """
    anchor = get_match_pos(attribute_value_tokens, sentTokens)[0]
    if anchor == -1:
        logger.info('this value cannot match')
        return []
    results = []
    for shift in range(CYCLE):
        if anchor >= len(sentTokens):
            # Anchor ran past the sentence: nothing to emit for this shift
            # (and the anchor is deliberately not advanced, as before).
            continue
        # The window reaches back at most `size` tokens before the anchor.
        if anchor < size:
            positions = get_can_pos_set(anchor)
            window = sentTokens[:anchor]
        else:
            positions = get_can_pos_set(size)
            window = sentTokens[anchor - size:anchor]
        for pos_combo in positions:
            results.append(([window[p] for p in pos_combo], shift, sentTokens))
        anchor += 1
    return results
def mergeNearPatterns(pASFFLSsList):
    """Iterate get_final_tokens() on the list until its length is stable for
    three consecutive rounds, then build the final pattern strings, falling
    back to getWholePattern() when a pattern has nothing before its '$'."""
    # The incoming PASFFWList has already been merged by identical fix tokens.
    # Convergence criterion: the size of PASFFWList no longer changes.
    pASFFLSsList_len = []
    while True:
        pASFFLSsList_len.append(len(pASFFLSsList))
        get_final_tokens(pASFFLSsList)
        if len(pASFFLSsList_len) < 3:
            continue
        else:
            # Stop once the last three recorded lengths are all equal.
            if pASFFLSsList_len[-1] == pASFFLSsList_len[
                    -2] == pASFFLSsList_len[-3]:
                break
    final_pASFFLSsList_print = ''
    for pASFFLSs in pASFFLSsList:
        final_pASFFLSsList_print += pASFFLSs[0] + ', ' + pASFFLSs[
            1] + ', ' + str(pASFFLSs[2]) + ', ' + str(
                pASFFLSs[3]) + ', ' + str(pASFFLSs[4]) + '\n'
    logger.info("get the final merged pASFFLSsList: \n" +
                final_pASFFLSsList_print)
    logger.info(str(pASFFLSsList))
    final_pattern_can = []
    for pASFFLSs in pASFFLSsList:
        pattern = get_final_pattern(pASFFLSs)
        # NOTE(review): pattern.index('$') raises ValueError when the pattern
        # contains no '$' — presumably every pattern has one; confirm.
        if pattern[:pattern.index('$')] == '':
            # final_pattern.extend([getWholePattern(x) for x in pASFFLSsList])
            final_pattern_can.append(getWholePattern(pASFFLSs))
        else:
            final_pattern_can.append(pattern)
    final_pattern_print = ''
    for pattern in final_pattern_can:
        final_pattern_print += pattern + '\n'
    logger.info("get the final patterns can: \n" + final_pattern_print)
    final_pattern = remove_dup_pattern(final_pattern_can)
    return final_pattern
# getVBContext(attribute2PSFSList) #filter the pattern candidates PASFFLSsList = filterPatterns(attribute2PSFSList) #merge the pattern candidates patterns = merge_pattern_all(PASFFLSsList) patterns_priority = calculate_proprity2pattern(patterns) logger.info("has acquired all the patterns") json.dump(patterns_priority, codecs.open(path_pattern, 'w', 'utf-8'), ensure_ascii=False, indent=2) # json.dump(dict_reverse(patterns), codecs.open(path_pattern_reverse, 'w','utf-8'),ensure_ascii=False,indent=2) # dict_sorted_value(dict_reverse(patterns),path_pattern_reverse_sorted) sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted) logger.info("output over") if __name__ == '__main__': start = datetime.datetime.now() logger.info(start) print start main() # test_get_final_tokens() # test_get_final_pattern() # test_getChunkPrefix() end = datetime.datetime.now() logger.info(end) print end logger.info("cost time: " + str((end - start).microseconds)) print str((end - start).seconds) + ' s'
def filterPatterns(attribute2PSFSList): logger.info('------------------filter the patterns--------------------') attribute2PSFFLSsList = {} for attribue, PSFSList in attribute2PSFSList.iteritems(): print 'attribute: ' + attribue, "attribute length:", len(PSFSList) logger.info('final attribute: %s' % attribue) logger.info('patterns can original length: ' + str(len(PSFSList))) filterPSFFLSsList = filterGrammerWrong(PSFSList) attribute2PSFFLSsList[attribue] = filterPSFFLSsList logger.info('final patterns length %d:' % len(filterPSFFLSsList)) logger.info(str(filterPSFFLSsList) + '\n') PASFFLSsList = filterPatternNotOnly(attribute2PSFFLSsList) logger.info("final PASFFLSs: ") for PASFFLSs in PASFFLSsList: logger.info(PASFFLSs) return PASFFLSsList
def merge_pattern(pASFFLSsList):
    """Group one attribute's PASFFLSs entries by identical fix tokens, reduce
    each group via get_final_pASFFLSs(), then hand the reduced list to
    mergeNearPatterns() and return the resulting pattern list."""
    print "----------------merge patterns of attribute: %s ----------------" % pASFFLSsList[
        0][1]
    logger.info(
        "----------------merge patterns of attribute: %s ----------------" %
        pASFFLSsList[0][1])
    # PASFFW: patternAttributeShiftFreqFixtokenWindow
    # Merge all patterns coming from the same fix_tokens into one sub-list.
    pASFFLSsList_print = ''
    for pASFFLSs in pASFFLSsList:
        pASFFLSsList_print += pASFFLSs[0] + ', ' + pASFFLSs[1] + ', ' + str(
            pASFFLSs[2]) + ', ' + str(pASFFLSs[3]) + '\n'
    logger.info("---------------patterns before merging :\n%s---------------" %
                pASFFLSsList_print)
    pASFFLSsListList = []
    pos_visit = []
    for i in range(len(pASFFLSsList)):
        # If i was already visited, pASFFLSsList[i] has been added to some
        # sub-list, so skip it.
        if i in pos_visit:
            continue
        pos_i = contain_PASFFW_by_Fixtokens(pASFFLSsList[i], pASFFLSsListList)
        if pos_i == -1:
            # No existing group shares these fix tokens: start a new group,
            # then re-query to learn the new group's index.
            pattern_can_i = []
            pattern_can_i.append(pASFFLSsList[i])
            pos_visit.append(i)
            pASFFLSsListList.append(pattern_can_i)
            pos_i = contain_PASFFW_by_Fixtokens(pASFFLSsList[i],
                                                pASFFLSsListList)
        for j in range(i + 1, len(pASFFLSsList), 1):
            if j in pos_visit:
                continue
            pos_j = contain_PASFFW_by_Fixtokens(pASFFLSsList[j],
                                                pASFFLSsListList)
            if pos_j == pos_i:
                pASFFLSsListList[pos_i].append(pASFFLSsList[j])
                pos_visit.append(j)
    # NOTE(review): from here on the loop variable pASFFLSsList shadows the
    # function parameter; the parameter is not needed afterwards, but the
    # reuse is easy to misread.
    pASFFLSs_print = ''
    for pASFFLSsList in pASFFLSsListList:
        pASFFLSs_print += "same fix tokens: %s\n" % str(pASFFLSsList[0][4])
        for pASFFLSs in pASFFLSsList:
            pASFFLSs_print += pASFFLSs[0] + ', ' + pASFFLSs[1] + ', ' + str(
                pASFFLSs[2]) + ', ' + str(pASFFLSs[3]) + '\n'
        pASFFLSs_print += '----------\n'
    logger.info("merge by the same fix tokens: \n" + pASFFLSs_print)
    final_pASFFLSsList = []
    for pASFFLSsList in pASFFLSsListList:
        final_pASFFLSsList.append(get_final_pASFFLSs(pASFFLSsList))
    final_pASFFLSsList_print = ''
    for pASFFLSs in final_pASFFLSsList:
        final_pASFFLSsList_print += pASFFLSs[0] + ', ' + pASFFLSs[
            1] + ', ' + str(pASFFLSs[2]) + ', ' + str(pASFFLSs[3]) + '\n'
    logger.info("get the final PASFFWList: \n" + final_pASFFLSsList_print)
    # logger.info("get the final PASFFWList: \n"+str(final_pASFFLSsList))
    final_pattern_list = mergeNearPatterns(final_pASFFLSsList)
    return final_pattern_list