# Imports these snippets rely on (a sketch of the module header; logger
# configuration is assumed to happen elsewhere in the original module):
import codecs
import datetime
import json
import logging
import os
from collections import Counter

logger = logging.getLogger(__name__)


def find_candidate_pattern(patterns, sent_pos):
    candidate_patterns = []
    for pattern in patterns:
        #         start=datetime.datetime.now()
        if is_candidate_pattern(pattern, sent_pos):
            logger.info(pattern + ": is candidate")
            candidate_patterns.append(pattern)
#         end=datetime.datetime.now()
#         print (end-start).microseconds
    return candidate_patterns
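

# Usage sketch (hypothetical inputs; is_candidate_pattern is assumed to test a
# '+'-joined POS pattern string against the sentence's POS string):
#
#     candidates = find_candidate_pattern(['JJ+NN', 'VB+NN'], 'DT+JJ+NN')
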
def getVBContext(attribute2prefixList):
    VBContextList = []
    #     for item in attribute2prefixList.iteritems():
    #         for prefix in item[1]:
    prefixList = []
    for attribute, prefix in attribute2prefixList.iteritems():
        prefixList.extend(prefix)

    prefixListContext = []

    for prefix in prefixList:
        POSList = prefix[0].split('+')
        VBPos = indexVB(POSList)
        if VBPos == -1:
            continue
        if VBPos >= len(POSList) - 2:
            if len(POSList) <= 3:
                context = '+'.join(POSList[:])
                VBContextList.append(context)
                prefixListContext.append(
                    (context, prefix[0], prefix[1], prefix[2]))
            else:
                context = '+'.join(POSList[len(POSList) - 3:])
                VBContextList.append(context)
                prefixListContext.append(
                    (context, prefix[0], prefix[1], prefix[2]))
        elif VBPos == len(POSList) - 3:
            if len(POSList) <= 4:
                context = '+'.join(POSList[:-1])
                VBContextList.append(context)
                prefixListContext.append(
                    (context, prefix[0], prefix[1], prefix[2]))
            else:
                context = '+'.join(POSList[len(POSList) - 4:-1])
                VBContextList.append(context)
                prefixListContext.append(
                    (context, prefix[0], prefix[1], prefix[2]))
        elif VBPos == len(POSList) - 4:
            if len(POSList) <= 5:
                context = '+'.join(POSList[:-2])
                VBContextList.append(context)
                prefixListContext.append(
                    (context, prefix[0], prefix[1], prefix[2]))
            else:
                context = '+'.join(POSList[len(POSList) - 5:-2])
                VBContextList.append(context)
                prefixListContext.append(
                    (context, prefix[0], prefix[1], prefix[2]))

    logger.info('\n----------------VBContext num: ------------------')
    c_VBContext = Counter(VBContextList)
    for item in sorted(c_VBContext.iteritems(),
                       key=lambda asd: asd[1],
                       reverse=True):
        logger.info(item[0] + ': ' + str(item[1]))

    logger.info('\n----------------VBContext seq: ------------------')
    for tup in sorted(prefixListContext, key=lambda asd: asd[0],
                      reverse=False):
        logger.info(tup)
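

# The three branches above all take a window of at most three POS tags whose
# right edge lies one tag past the VB (clipped to the end of the sequence),
# skipping any VB more than four tags from the end. A compact equivalent, as a
# sketch (reusing indexVB as defined elsewhere in this module):
def vb_context(pos_list):
    vb = indexVB(pos_list)
    if vb == -1 or vb < len(pos_list) - 4:
        return None
    end = min(len(pos_list), vb + 2)  # one tag past the VB, clipped
    start = max(0, end - 3)           # window of at most three tags
    return '+'.join(pos_list[start:end])
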
def main():
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = "items_tagged_modified.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
    #     path_pattern = path_project + os.sep + "output" + os.sep + "Patterns_auto.json"
    #     path_pattern_sorted = path_project + os.sep + "output" + os.sep + "pattern_auto_sorted_by_attribute_pattern.json"
    path_pattern = "Patterns_auto.json"
    path_pattern_sorted = "pattern_auto_sorted_by_attribute_pattern.json"

    #     path_pattern_sorted_by_attribute=path_project+os.sep+"output"+os.sep+"pattern_auto_sorted_by_attribute.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    #acquire all the pattern candidates
    attribute2PSFSList = acquire_patterns(data)
    #     getVBContext(attribute2PSFSList)
    #filter the pattern candidates
    PASFFLSsList = filterPatterns(attribute2PSFSList)
    #merge the pattern candidates
    patterns = merge_pattern_all(PASFFLSsList)

    patterns_priority = calculate_proprity2pattern(patterns)
    logger.info("has acquired all the patterns")
    json.dump(patterns_priority,
              codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    #     json.dump(dict_reverse(patterns), codecs.open(path_pattern_reverse, 'w','utf-8'),ensure_ascii=False,indent=2)
    #     dict_sorted_value(dict_reverse(patterns),path_pattern_reverse_sorted)
    sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted)
    logger.info("output over")
def get_final_tokens(pASFFLSsList):
    # Merge every pair of patterns whose heads/tails overlap; two patterns are
    # only merged if they occur in the same sentence.
    # A pattern that intersects no other pattern is kept as-is.
    if len(pASFFLSsList) == 1:
        return pASFFLSsList
    pASFFLSsList_new = []
    visit_ele = set([])
    for i in range(len(pASFFLSsList)):
        #         logger.info('i=%d' % i)
        # Records whether the i-th pASFFLSs intersects any later pASFFLSs; if
        # it does not, and it was never merged into another pASFFLSs, it goes
        # into the final result on its own.
        cmp_result = []
        for j in range(i + 1, len(pASFFLSsList), 1):
            cmp_ = is_intersectPASFFW(pASFFLSsList[i], pASFFLSsList[j])
            if cmp_ == -1:
                #                 logger.info((' have no intersect ',PASFFWList[i],PASFFWList[j]))
                continue
            else:
                visit_ele.add(i)
                visit_ele.add(j)
                pASFFLSsList_new.append(cmp_)
            cmp_result.append(cmp_)

        # cmp_result never contains -1 (that case is skipped by the continue
        # above), so an empty cmp_result means pASFFLSsList[i] merged with
        # nothing that follows it; keep it if no earlier element merged it in.
        if len(cmp_result) == 0 and (i not in visit_ele):
            pASFFLSsList_new.append(pASFFLSsList[i])

    pASFFLSsList_new_RemoveDup = removeDupPASFFLSs(pASFFLSsList_new)
    logger.info('final merge result:')
    for x in pASFFLSsList_new_RemoveDup:
        logger.info(x)
    del pASFFLSsList[:]
    pASFFLSsList.extend(pASFFLSsList_new_RemoveDup)
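

# Worked example of the merge rule above (hypothetical elements): for two
# overlapping entries A and B drawn from the same sentence,
# is_intersectPASFFW(A, B) returns their merged form, so both A and B are
# marked visited and replaced by the merge; an entry that intersects nothing
# is carried over unchanged, and removeDupPASFFLSs drops any duplicates the
# pairwise pass produced.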
Example #5
def getChunkPrefix(size, attribute_value_tokens, sentTokens):
    match_pos = get_match_pos(attribute_value_tokens, sentTokens)
    can_pos_set = []
    start = match_pos[0]
    if start == -1:
        logger.info('this value cannot match')
        #         logger.info('sent:' + str(sentTokens))
        #         logger.info('attribute_value:' + str(attribute_value_tokens))
        return []

    window = []
    prefixShiftSentList = []
    '''
    When start < size, the window runs from index 0 up to start;
    when start >= size, the window runs from start - size up to start.
    '''

    for i in range(CYCLE):
        #     for i in range(size):
        if start < size and start < len(sentTokens):
            can_pos_set = get_can_pos_set(start)
            window = sentTokens[:start]
        elif start >= size and start < len(sentTokens):
            can_pos_set = get_can_pos_set(size)
            window = sentTokens[start - size:start]
        else:
            # start has run past the sentence; remaining shifts are no-ops
            continue

        for can_pos in can_pos_set:
            prefix = []
            for pos in can_pos:
                prefix.append(window[pos])
            prefixShiftSentList.append((prefix, i, sentTokens))
        start = start + 1

    return prefixShiftSentList
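

# Usage sketch (hypothetical tokens; CYCLE, get_match_pos and get_can_pos_set
# are assumed as defined elsewhere in this module): with size=2 and a match
# position start=2 in sentTokens=['the', 'color', 'is', 'red'], the window is
# ['the', 'color'] on shift i=0 and ['color', 'is'] on shift i=1, and every
# prefix drawn from each window is recorded as (prefix, i, sentTokens).
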
def mergeNearPatterns(pASFFLSsList):
    # The incoming pASFFLSsList has already been merged by identical fix tokens.
    # The iteration converges when the size of pASFFLSsList stops changing.

    pASFFLSsList_len = []
    while True:
        pASFFLSsList_len.append(len(pASFFLSsList))
        get_final_tokens(pASFFLSsList)
        if len(pASFFLSsList_len) < 3:
            continue
        else:
            if pASFFLSsList_len[-1] == pASFFLSsList_len[
                    -2] == pASFFLSsList_len[-3]:
                break

    final_pASFFLSsList_print = ''
    for pASFFLSs in pASFFLSsList:
        final_pASFFLSsList_print += pASFFLSs[0] + ', ' + pASFFLSs[
            1] + ', ' + str(pASFFLSs[2]) + ', ' + str(
                pASFFLSs[3]) + ', ' + str(pASFFLSs[4]) + '\n'
    logger.info("get the final merged pASFFLSsList: \n" +
                final_pASFFLSsList_print)
    logger.info(str(pASFFLSsList))

    final_pattern_can = []
    for pASFFLSs in pASFFLSsList:
        pattern = get_final_pattern(pASFFLSs)
        if pattern[:pattern.index('$')] == '':
            #             final_pattern.extend([getWholePattern(x) for x in pASFFLSsList])
            final_pattern_can.append(getWholePattern(pASFFLSs))
        else:
            final_pattern_can.append(pattern)

    final_pattern_print = ''
    for pattern in final_pattern_can:
        final_pattern_print += pattern + '\n'
    logger.info("get the final patterns can: \n" + final_pattern_print)

    final_pattern = remove_dup_pattern(final_pattern_can)

    return final_pattern
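

# The loop at the top of mergeNearPatterns is a fixed-point iteration: the
# in-place merge is re-run until the list length has been identical for three
# consecutive rounds. The same idea as a standalone sketch (step is any
# hypothetical in-place transformation):
def iterate_to_fixpoint(items, step, stable_rounds=3):
    lengths = []
    while True:
        lengths.append(len(items))
        step(items)
        if len(lengths) >= stable_rounds and \
                len(set(lengths[-stable_rounds:])) == 1:
            return items
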


if __name__ == '__main__':
    start = datetime.datetime.now()
    logger.info(start)
    print start
    main()
    #     test_get_final_tokens()
    #     test_get_final_pattern()
    #     test_getChunkPrefix()
    end = datetime.datetime.now()
    logger.info(end)
    print end
    logger.info("cost time: " + str((end - start).microseconds))
    print str((end - start).seconds) + ' s'
Example #8
def filterPatterns(attribute2PSFSList):
    logger.info('------------------filter the patterns--------------------')
    attribute2PSFFLSsList = {}
    for attribute, PSFSList in attribute2PSFSList.iteritems():
        print 'attribute: ' + attribute, 'patterns:', len(PSFSList)
        logger.info('final attribute: %s' % attribute)
        logger.info('patterns can original length: ' + str(len(PSFSList)))
        filterPSFFLSsList = filterGrammerWrong(PSFSList)
        attribute2PSFFLSsList[attribute] = filterPSFFLSsList
        logger.info('final patterns length: %d' % len(filterPSFFLSsList))
        logger.info(str(filterPSFFLSsList) + '\n')
    PASFFLSsList = filterPatternNotOnly(attribute2PSFFLSsList)
    logger.info("final PASFFLSs: ")
    for PASFFLSs in PASFFLSsList:
        logger.info(PASFFLSs)
    return PASFFLSsList
def merge_pattern(pASFFLSsList):
    print "----------------merge patterns of attribute: %s ----------------" % pASFFLSsList[
        0][1]
    logger.info(
        "----------------merge patterns of attribute: %s ----------------" %
        pASFFLSsList[0][1])
    # PASFFW: patternAttributeShiftFreqFixtokenWindow
    # Group patterns that come from the same fix_tokens into one list.
    pASFFLSsList_print = ''
    for pASFFLSs in pASFFLSsList:
        pASFFLSsList_print += pASFFLSs[0] + ', ' + pASFFLSs[1] + ', ' + str(
            pASFFLSs[2]) + ', ' + str(pASFFLSs[3]) + '\n'

    logger.info("---------------patterns before merging :\n%s---------------" %
                pASFFLSsList_print)

    pASFFLSsListList = []
    pos_visit = []
    for i in range(len(pASFFLSsList)):
        # If i has already been visited, pASFFLSsList[i] has already been
        # added to one of the sub-lists.
        if i in pos_visit:
            continue
        pos_i = contain_PASFFW_by_Fixtokens(pASFFLSsList[i], pASFFLSsListList)
        if pos_i == -1:
            pattern_can_i = []
            pattern_can_i.append(pASFFLSsList[i])
            pos_visit.append(i)
            pASFFLSsListList.append(pattern_can_i)
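        # Re-resolve the group index: the append above may have just created
        # the group that pASFFLSsList[i] belongs to.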
        pos_i = contain_PASFFW_by_Fixtokens(pASFFLSsList[i], pASFFLSsListList)
        for j in range(i + 1, len(pASFFLSsList), 1):
            if j in pos_visit:
                continue
            pos_j = contain_PASFFW_by_Fixtokens(pASFFLSsList[j],
                                                pASFFLSsListList)
            if pos_j == pos_i:
                pASFFLSsListList[pos_i].append(pASFFLSsList[j])
                pos_visit.append(j)

    pASFFLSs_print = ''
    for group in pASFFLSsListList:
        pASFFLSs_print += "same fix tokens: %s\n" % str(group[0][4])
        for pASFFLSs in group:
            pASFFLSs_print += pASFFLSs[0] + ', ' + pASFFLSs[1] + ', ' + str(
                pASFFLSs[2]) + ', ' + str(pASFFLSs[3]) + '\n'
        pASFFLSs_print += '----------\n'
    logger.info("merge by the same fix tokens: \n" + pASFFLSs_print)

    final_pASFFLSsList = []
    for group in pASFFLSsListList:
        final_pASFFLSsList.append(get_final_pASFFLSs(group))

    final_pASFFLSsList_print = ''
    for pASFFLSs in final_pASFFLSsList:
        final_pASFFLSsList_print += pASFFLSs[0] + ', ' + pASFFLSs[
            1] + ', ' + str(pASFFLSs[2]) + ', ' + str(pASFFLSs[3]) + '\n'
    logger.info("get the final PASFFWList: \n" + final_pASFFLSsList_print)

    #     logger.info("get the final PASFFWList: \n"+str(final_pASFFLSsList))
    final_pattern_list = mergeNearPatterns(final_pASFFLSsList)

    return final_pattern_list
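

# If contain_PASFFW_by_Fixtokens tests exact equality of the fix tokens
# (element [4] of each pASFFLSs), the grouping pass above reduces to a plain
# group-by; a dictionary-based sketch under that assumption:
def group_by_fix_tokens(pASFFLSsList):
    groups = {}
    for pASFFLSs in pASFFLSsList:
        key = str(pASFFLSs[4])
        groups.setdefault(key, []).append(pASFFLSs)
    return list(groups.values())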