def loadIndexSentenceList():
    '''
        Load the sentence list that carries index information.
    '''
    sentenceList = []
    sentenceFeatureList = []

    ## 1 Old-format fnlp data (150w-900w)
    # fnlpListPath = inout.getDataNEMeatPath('sentence_and_feature_150w-900w_fnlp_old.txt')
    # fnlpListPath = inout.getDataTestPath('sentence_and_feature_test.txt')
    fnlpListPath = '/data/wangtd/workspace/re/data/sentence_and_feature_150w-900w_fnlp_old.txt'

    fnlpOldDataList = inout.readListFromTxt(fnlpListPath)
    print 'raw data (old) len:', len(fnlpOldDataList)

    fnlpSentenceList_old, fnlpSentenceFeatureList_old = convertDataFormat(fnlpOldDataList)
    print 'processed data (old) len:', len(fnlpSentenceList_old)

    sentenceList.extend(fnlpSentenceList_old)
    sentenceFeatureList.extend(fnlpSentenceFeatureList_old)

    ## 2 New-format fnlp data (900w-2100w)
    # fnlpNewDataListPath = inout.getDataNEMeatPath('sentence_and_feature_900w-2100w_fnlp_new.txt')
    # fnlpNewDataListPath = inout.getDataTestPath('sentence_and_feature_test_new.txt')
    fnlpNewDataListPath = '/data/wangtd/workspace/re/data/sentence_and_feature_900w-2100w_fnlp_new.txt'

    fnlpNewDataList = inout.readListFromTxt(fnlpNewDataListPath)
    print 'raw data (new) len:', len(fnlpNewDataList)

    fnlpSentenceList_new, fnlpSentenceFeatureList_new = convertNewDataFormat(fnlpNewDataList)
    print 'processed data (new) len:', len(fnlpSentenceList_new)

    sentenceList.extend(fnlpSentenceList_new)
    sentenceFeatureList.extend(fnlpSentenceFeatureList_new)

    print 'sentenceList len: ', len(sentenceList)
    print 'sentenceFeatureList len: ', len(sentenceFeatureList)
    print 'Data loading finished...'

    sentenceList, sentenceFeatureList = dictDistinct(sentenceList, sentenceFeatureList)  # see the sketch after this function
    print 'Sentence deduplication finished...', len(sentenceList)

    return sentenceList, sentenceFeatureList
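
# dictDistinct is defined elsewhere in the project. A minimal sketch of its assumed
# behaviour -- drop duplicate sentences while keeping the feature list in step -- is
# given here for reference only; it is not necessarily the project's actual implementation.
def dictDistinct(sentenceList, sentenceFeatureList):
    seenSentences = set()
    distinctSentenceList = []
    distinctFeatureList = []
    for sentence, feature in zip(sentenceList, sentenceFeatureList):
        if sentence in seenSentences:
            continue
        seenSentences.add(sentence)
        distinctSentenceList.append(sentence)
        distinctFeatureList.append(feature)
    return distinctSentenceList, distinctFeatureList
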
def getRelationShipDic():
    asymmetricInFilePath = inout.getResourcePath('asymmetricRelationShip.txt')
    symmetricInFilePath = inout.getResourcePath('symmetricRelationShip.txt')

    infoList = inout.readListFromTxt(asymmetricInFilePath)
    infoList.extend(inout.readListFromTxt(symmetricInFilePath))
    print 'total number of normalized relations:', len(infoList)

    # Initialise the dictionary that will be persisted
    initDic = dict()

    for lineItem in infoList:
        lineList = lineItem.strip().split('\t')
        key = lineList[0].strip()
        valueList = lineList[-1].strip()[1:-1].replace(' ', '').split(',')
        ## First approach: slice off the surrounding brackets and split on commas
        initDic[key] = valueList
        ## A second approach is also possible -- see the sketch after this function

    return initDic
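

# Illustrative sketch (not part of the original code) of the "second approach" mentioned
# in getRelationShipDic. It assumes the value field looks like "[a, b, c]", the same format
# the slicing approach above handles; instead of slicing off the first and last characters
# it strips the brackets explicitly and trims each item, which tolerates stray whitespace.
def parseValueField(rawField):
    items = rawField.strip().strip('[]').split(',')
    return [item.strip() for item in items if item.strip()]
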
## ---- Example 3 ----

def isExist(sonWordArr, momWordList):
    # NOTE: the original definition was truncated here; this header and body are
    # reconstructed from the call site below and are an assumption, not the original code.
    if (sonWordArr[0], sonWordArr[1]) in momWordList:
        return True
    return False


if __name__ == '__main__':
    """
        处理4500后的数据,取一个差集
    """

    blackWordList = ['利物浦', '凤凰', 'CEO', 'AC米兰', '阿森纳', '尤文图斯', 'TD', '欧战',
                     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
                     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                     '欧洲杯', '主任', '沃尔沃', '恒大', '鲁能', '胡润', '周岁',
                     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                     '双冰', '詹皇', '尤文', '王总']

    momInfo = inout.readListFromTxt(
        'D:/michaelD/kg/entity_relation_weight_4500.txt')
    sonInfo = inout.readListFromTxt('D:/michaelD/kg/4500_after_data.txt')
    outPath = 'D:/michaelD/kg/after_tp_1.txt'

    momWordList = []
    for momItem in momInfo:
        wordArr = momItem.split('\t')[0].split(' ')
        momWordList.append((wordArr[0], wordArr[1]))

    middleList = []
    for sonItem in sonInfo:
        sonWordArr = sonItem.split('\t')[0].split(' ')
        if isExist(sonWordArr, momWordList):
            continue
        middleList.append(sonItem)
    print 'middleList:', len(middleList)
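    # outPath is defined above but the original snippet breaks off here; presumably the
    # difference set is written back out, e.g. (assumption, using the helper seen elsewhere
    # in this project):
    # inout.writeList2Txt(outPath, middleList)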
            # finally:
            #     articleTime = mildTime
        else:
            articleTime = coldTime

        weight = (articleTime - basicTime) / (systemTime - basicTime)
    return weight
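
# Worked example of the linear time weight above (hypothetical float timestamps): with
# basicTime = 1.0, systemTime = 11.0 and articleTime = 6.0 the weight is
# (6.0 - 1.0) / (11.0 - 1.0) = 0.5, i.e. an article halfway between the baseline time and
# "now" gets half the maximum weight. Note that under Python 2 the division truncates to 0
# if all three values are ints, so at least one operand should be a float.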


if __name__ == '__main__':

    ## Load the sentences with index information into memory
    # sentenceList,sentenceFeatureList = loadIndexSentenceList()

    sentenceFilePath = inout.getDataAnalysisPath('sentenceList.txt')
    sentenceList = inout.readListFromTxt(sentenceFilePath)

    # for item in sentenceList:
    #     print item
    # exit(0)

    inFilePath = inout.getDataAnalysisPath('vote_classify_module_result_fnlp_150w-2100w.txt')
    # inFilePath = '/data/wangtd/workspace/re/vote_classify_module_result_fnlp_150w-2100w.txt'

    outFilePath = inout.getDataAnalysisPath('vote_relation_weight_result_fnlp_150w-2000w.txt')
    # outFilePath = '/data/wangtd/workspace/re/vote_relation_weight_result_fnlp_150w-2000w.txt'

    infoList = inout.readListFromTxt(inFilePath)

    ## Start processing
    fw = codecs.open(outFilePath, 'wb')
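    # Note: codecs.open is normally given an explicit encoding; without one it behaves much
    # like the built-in open. A common pattern here would be (the utf-8 encoding is an assumption):
    # fw = codecs.open(outFilePath, 'w', encoding='utf-8')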
## ---- Example 5 ----

                    # print ne
                    neType = 'S-' + indexList[i].split('-')[-1]
                    finalNamedEntityList.append((ne, neType))
        if 'S' in indexList[i]:
            ne = neWordList[i]
            # print ne
            finalNamedEntityList.append((ne,indexList[i]))
    return finalNamedEntityList
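
# For reference, the tuples appended above pair an entity string with its tag, so the
# returned list might look like (illustrative values): [('刘德华', 'S-Nh'), ('北京', 'S-Ns')],
# where the suffix after 'S-' comes from indexList via split('-')[-1].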



if __name__ == '__main__':

    origin_corpus = inout.getDataOriginPath('origin_corpus_test.txt')

    infoList = inout.readListFromTxt(origin_corpus)

    # Add logic that filters sentences out of the raw text
    infoList = getSentenceList(infoList)
    print 'total number of raw sentences: ', len(infoList)
    # exit(0)

    # Initialise two parallel lists that are kept in sync
    sentenceList = []
    sentenceFeatureList = []

    i = 0
    ## Named entity recognition
    for sentence in infoList:

        namedEntityTagTupleList, neTagList = namedEntityRecognize(sentence)
## ---- Example 6 ----

    '''
    resultStr = ''
    for item in labelWordWeightList:
        resultStr = resultStr + '(' + str(item[0]) + ',' + str(
            item[1]) + ')' + ','
    return resultStr
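
# Example of the string built above (illustrative): for
# labelWordWeightList = [('兄弟', 0.62), ('朋友', 0.38)] the result is
# "(兄弟,0.62),(朋友,0.38)," -- note the trailing comma, which downstream code is assumed
# to strip or tolerate.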


if __name__ == '__main__':

    analysisPath = inout.getDataAnalysisPath('analysis.txt')

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic added here; the stop word lists can be merged later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    # corpusPath = inout.getDataOriginPath('special_corpus_copy.txt')
    corpusPath = inout.getDataOriginPath('special_corpus.txt')

    ## 1 Complex text data has to be cleaned first
    # Sentence splitting can be used during the initial text-cleaning pass
    # corpus = inout.onlyReadLine(corpusPath)
if __name__ == '__main__':

    # Output path
    outputPath = inout.getDataAnalysisPath(
        'analysis_vote_sentence_fnlp_150w-2100w.txt')
    # outputPath = inout.getDataAnalysisPath('analysis_vote_sentence_0615.txt')
    # outputPath = inout.getDataAnalysisPath('analysis_test.txt')

    ## Configuration
    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic added here; the stop word lists can be merged later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    ## Entry point of the module: load the objects
    """
        Data-loading strategy here (a sketch of both options follows below):
        1) load directly from a pkl object
        2) read the data from a text file into a list
        Either way the data is finally merged into one overall list.
    """
    sentenceList = []
    sentenceFeatureList = []
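
    ## A sketch of the two loading options described in the docstring above; the file names
    ## are placeholders (assumptions), while readPersistObject / readListFromTxt are the
    ## helpers already used elsewhere in this project.
    #
    # Option 1: load directly from a pkl object
    # sentenceList, slType = inout.readPersistObject(inout.getDataPklPath('sentence_list.pkl'))
    #
    # Option 2: read the data from a text file into a list
    # sentenceList.extend(inout.readListFromTxt(inout.getDataAnalysisPath('sentenceList.txt')))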
    print sentenceTwo
    sortedRelationList = eval(sentenceTwo)
    print type(sortedRelationList)
    print sortedRelationList
    exit(0)


if __name__ == '__main__':

    inFilePath = inout.getDataAnalysisPath(
        'vote_relation_ordered_result_fnlp_150w-2000w.txt')

    outFilePath = inout.getDataAnalysisPath(
        'vote_relation_ordered_result_fnlp_150w-2000w_handled.txt')

    infoList = inout.readListFromTxt(inFilePath)

    print 'info list len:', len(infoList)

    allSentenceList = []
    itemSentenceList = []
    for item in infoList:
        item = item.strip()
        if item != '':
            # print item
            itemSentenceList.append(item)
        else:
            # print '|' + item + '|'
            allSentenceList.append(itemSentenceList)
            itemSentenceList = []
    # Flush the trailing group in case the input does not end with a blank line
    if itemSentenceList:
        allSentenceList.append(itemSentenceList)

## ---- Example 9 ----

# -*- coding:utf-8 -*-

from utils import inout

if __name__ == '__main__':

    infoList = inout.readListFromTxt('D:/michaelD/kg/entity_relation_weight_origin.txt')
    # print len(infoList)
    # exit(0)

    outFilePath = 'D:/michaelD/kg/entity_relation_weight.txt'

    outputList = []
    for line in infoList:
        line = line.strip()

        # if '成员' in line:
        resultLine = line.replace('成员', '朋友')
        resultLine = resultLine.replace('middot;', '')
        resultLine = resultLine.replace('Paytm', '')
        resultLine = resultLine.replace(',', '')
        resultLine = resultLine.replace('&middot', '')  # strip the bare '&middot' entity (the '·' character)

        outputList.append(resultLine)

    inout.writeList2Txt(outFilePath,outputList)
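
    # Illustrative effect of the replacements above (hypothetical record): a line such as
    # "张三 成员 李四&middot" comes out as "张三 朋友 李四" -- the relation "成员" (member)
    # is renamed to "朋友" (friend) and the HTML-entity fragments are stripped.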

    sumWeight = getSumWeight(weightList)

    candidateWeight = candidateWeight / sumWeight

    candidateWeight = round(candidateWeight * 100,2)
    # print candidateWeight

    # exit(0)
    return candidateWeight
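
# Worked example of the normalisation above (hypothetical values): with candidateWeight = 3.0
# and sumWeight = 12.0 the share is 3.0 / 12.0 = 0.25, and round(0.25 * 100, 2) gives 25.0,
# i.e. this candidate relation carries 25% of the total weight for the entity pair.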


if __name__ == '__main__':

    rootDir = 'D:/michaelD/kg/'

    infoList = inout.readListFromTxt(rootDir + 'rEOrigin_sen_20.txt')

    outputFilePath = rootDir + 'entity_relation_weight.txt'

    groupList = []
    tupleList = []
    for line in infoList:
        line = line.strip()
        if '候选关系:【' in line:
            if tupleList:
                groupList.append(tupleList)
                tupleList = []
        tupleList.append(line)
    # Flush the last group, which the loop above never appends on its own
    if tupleList:
        groupList.append(tupleList)

    outputList = []
    for tupleItemList in groupList:
# -*- coding:utf-8 -*-

from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from utils import inout
import index

if __name__ == '__main__':

    segmentor = Segmentor()
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt'))
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))

    infoList = inout.readListFromTxt('./dn_test.txt')

    for sentence in infoList:

        # segmentor.load(inout.getLTPPath(index.CWS))
        words = segmentor.segment(sentence)
        postags = postagger.postag(words)
        # result = zip(words,postags)
        # inout.printEscapeStr(result)


    segmentor.release()
    postagger.release()

    # recognizer = NamedEntityRecognizer()
    # recognizer.load(inout.getLTPPath(index.NER))
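    # If the recognizer above were loaded, named-entity tags could be obtained from the
    # segmentation and POS results inside the loop (standard pyltp usage):
    # netags = recognizer.recognize(words, postags)
    # recognizer.release()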

## ---- Example 12 ----

    ## Input parameters
    n_cluster = 300
    # n_cluster = 15000

    corpusNum = 500

    analysisPath = inout.getDataAnalysisPath('analysis_cluster_sentence.txt')

    ## Configuration
    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic added here; the stop word lists can be merged later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    ## Entry point of the module: load the persisted objects
    sentencePath = inout.getDataPklPath(
        'sentence_list_corpus_complete_sentence.pkl')
    sentenceFeaturePath = inout.getDataPklPath(
        'sentence_feature_list_corpus_complete_sentence.pkl')

    sentenceList, slType = inout.readPersistObject(sentencePath)
    sentenceFeatureList, sflType = inout.readPersistObject(sentenceFeaturePath)
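
    # inout.readPersistObject belongs to the project's utils and is not shown here. Based on
    # the unpacking above it appears to return an (object, type/flag) pair; a minimal
    # pickle-based sketch under that assumption might be:
    #
    # import pickle
    #
    # def readPersistObject(path):
    #     with open(path, 'rb') as f:
    #         obj = pickle.load(f)
    #     return obj, type(obj)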