Example #1
                if 'E' in indexList[i + 1]:
                    ne = neWordList[i] + neWordList[i + 1]
                    # print ne
                    neType = 'S' + '-' + indexList[i].split('-')[-1]
                    finalNamedEntityList.append((ne, neType))
        if 'S' in indexList[i]:
            ne = neWordList[i]
            # print ne
            finalNamedEntityList.append((ne,indexList[i]))
    return finalNamedEntityList
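
# A self-contained sketch of the full BIES merge that the truncated fragment
# above implements only partially (just the 'E' and 'S' branches survive).
# The function name and the exact B/I/E/S tag scheme are assumptions; it
# mirrors the fragment's habit of relabelling merged entities as 'S-<type>'.
def extractNamedEntities(neWordList, indexList):
    finalNamedEntityList = []
    i = 0
    while i < len(indexList):
        tag = indexList[i]
        if tag.startswith('B'):          # multi-word entity: B (I ...) E
            neType = 'S' + '-' + tag.split('-')[-1]
            ne = neWordList[i]
            i += 1
            while i < len(indexList) and not indexList[i].startswith('E'):
                ne += neWordList[i]      # interior 'I' tokens
                i += 1
            if i < len(indexList):
                ne += neWordList[i]      # closing 'E' token
            finalNamedEntityList.append((ne, neType))
        elif tag.startswith('S'):        # single-word entity
            finalNamedEntityList.append((neWordList[i], tag))
        i += 1
    return finalNamedEntityList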



if __name__ == '__main__':

    origin_corpus = inout.getDataOriginPath('origin_corpus_test.txt')

    infoList = inout.readListFromTxt(origin_corpus)

    # Add logic to filter sentences out of the raw text
    infoList = getSentenceList(infoList)
    print 'Total number of original sentences: ', len(infoList)
    # exit(0)

    # Initialize two synchronized lists
    sentenceList = []
    sentenceFeatureList = []

    i = 0
    ## Named entity recognition
    for sentence in infoList:
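
Example #1 breaks off at the top of the NER loop. Assuming these scripts use HIT's LTP toolkit (pyltp, where the SentenceSplitter seen in the next example comes from), the loop body might look roughly like the sketch below; the model paths are placeholders.

from pyltp import Segmentor, Postagger, NamedEntityRecognizer

segmentor = Segmentor()
segmentor.load('ltp_data/cws.model')    # placeholder model paths
postagger = Postagger()
postagger.load('ltp_data/pos.model')
recognizer = NamedEntityRecognizer()
recognizer.load('ltp_data/ner.model')

for sentence in infoList:
    words = list(segmentor.segment(sentence))
    postags = list(postagger.postag(words))
    # netags are BIES-style labels such as 'B-Nh', 'E-Ni', 'S-Ns', or 'O'
    netags = list(recognizer.recognize(words, postags))
    sentenceList.append(words)
    sentenceFeatureList.append(netags)

segmentor.release()
postagger.release()
recognizer.release()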
Example #2
    analysisPath = inout.getDataAnalysisPath('analysis.txt')

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic: dedupe for now; the stop word lists can be merged properly later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    # corpusPath = inout.getDataOriginPath('special_corpus_copy.txt')
    corpusPath = inout.getDataOriginPath('special_corpus.txt')

    ## 1. Complex text data needs to be cleaned first
    # Sentence splitting can be used during the initial cleaning pass
    # corpus = inout.onlyReadLine(corpusPath)
    # sentences = SentenceSplitter.split(corpus)
    # sentences = '\t'.join(sentences)#.decode('utf-8')
    # sentenceList = sentences.split('\t')
    # Read the list directly from the text file
    sentenceList = inout.readListFromTxt(corpusPath)
    printEscapeStr(sentenceList)
    # exit(0)
    # Dictionary of named entity types
    neTypeDic = getNamedEntityTypeDic()

    for originSentenceI in range(len(sentenceList)):
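
This example also breaks off at its loop. The commented-out cleaning block above splits raw text into sentences with pyltp's SentenceSplitter; a minimal standalone version of that step (the sample string is made up):

from pyltp import SentenceSplitter

corpus = '白雪公主吃了苹果。王子吻醒了她。'
for s in SentenceSplitter.split(corpus):
    print s
# => 白雪公主吃了苹果。
#    王子吻醒了她。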
Example #3
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from utils import inout
from utils.inout import printEscapeStr
import jieba
import codecs

if __name__ == '__main__':

    inFilePath = inout.getDataOriginPath('snow_white_origin.txt')
    outFilePath = inout.getDataOriginPath('snow_white_clean.txt')

    infoList = inout.readListFromTxt(inFilePath)

    outputStr = ''

    # Strip tabs, newlines, and carriage returns, then segment with jieba
    for item in infoList:
        sentence = item.replace('\t', '').replace('\n', '').replace('\r', '').strip()
        sentenceSplitList = jieba.cut(sentence)
        outputStr = outputStr + ' '.join(sentenceSplitList) + ' '

    outputStr = outputStr + '\n'
    fw = codecs.open(outFilePath, 'w', 'utf-8')
    fw.write(outputStr)
    fw.close()
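
For reference, jieba.cut returns a generator of tokens, which is why the script joins them with spaces; the stock jieba demo sentence shows the kind of output to expect:

import jieba

print ' '.join(jieba.cut(u'我来到北京清华大学'))
# => 我 来到 北京 清华大学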
Example #4
def packageWord2Sentence(sentenceWordList):
    '''
        Convert a list of word_tag tokens back into a sentence string
    '''
    sentence = ''
    for item in sentenceWordList:
        sentence = sentence + item.split('_')[0].strip()
    return sentence
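
# A quick sanity check of packageWord2Sentence; the word_tag tokens below
# are made-up examples, and everything after the first '_' (the POS tag)
# is dropped:
#
#   print packageWord2Sentence(['白雪公主_nr', '吃_v', '了_u', '苹果_n'])
#   => 白雪公主吃了苹果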


if __name__ == '__main__':
    origin_corpus = inout.getDataOriginPath('origin_corpus_cmpp.txt')
    # origin_corpus = inout.getDataTestPath('origin_corpus_test.txt')

    infoList = inout.readListFromTxt(origin_corpus)

    # Initialize two synchronized lists
    sentenceList = []
    sentenceFeatureList = []

    j = 0
    for i in range(len(infoList)):
        line = infoList[i].strip()
        if line:
            if line != '_w  _w':
                lineList = line.split(' ')
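
Example #4 is cut off right after the split. Assuming tokens of the form word_tag (the '_w  _w' guard above suggests underscore-joined word/tag pairs), the unpacking into the two synchronized lists might look like this sketch:

def unpackTokens(lineList):
    # Split each assumed 'word_tag' token at its last underscore
    words = [t.rsplit('_', 1)[0] for t in lineList]
    tags = [t.rsplit('_', 1)[-1] for t in lineList]
    return words, tags

words, tags = unpackTokens(['白雪公主_nr', '吃_v', '苹果_n'])
print ' '.join(words)  # => 白雪公主 吃 苹果
print ' '.join(tags)   # => nr v n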
Example #5
                    ne = neWordList[i] + neWordList[i + 1]
                    # print ne
                    neType = 'S' + '-' + indexList[i].split('-')[-1]
                    finalNamedEntityList.append((ne, neType))
        if 'S' in indexList[i]:
            ne = neWordList[i]
            # print ne
            finalNamedEntityList.append((ne, indexList[i]))
    return finalNamedEntityList


if __name__ == '__main__':

    inputFileName = ''

    origin_corpus = inout.getDataOriginPath(inputFileName)
    # origin_corpus = inout.getDataTestPath('origin_corpus_test.txt')

    infoList = inout.readListFromTxt(origin_corpus)

    print 'Finished loading data...'

    # Initialize two synchronized lists
    sentenceList = []
    sentenceFeatureList = []

    j = 0
    for i in range(len(infoList)):
        line = infoList[i].strip()
        if line:
            if line != '_w  _w':
Example #6
                # outputLine = line.strip()

                # fw.write(outputLine + '\n')

                resultList.append(line.strip())
                # i += 1
        print i

    # fw.close()
    fr.close()
    outFilePath = inout.getDataOriginPath('graph_candidate_entity_relation_150w-2100w.txt')
    inout.writeList2Txt(outFilePath,resultList)

    print 'Finished writing...'
    """
        test:
        测试了原来pkl对象内部的结构:
        [[('尼古拉斯', 'S-Nh'), ('奥古斯特·科波拉', 'S-Nh')], ['生于', '加州', '一个', '中产', '家庭', ',', '意大利', '裔', '父亲', '是', '文学', '教授', ',', '德国裔', '的', '母亲', 'Joy']]
    """
    # sentenceFeatureList,typeList = inout.readPersistObject(inout.getDataPklPath('sentence_feature_list_corpus_complete_sentence.pkl'))
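
Given the structure noted in the test comment, reading the persisted object back might look like the sketch below; the unpacking mirrors the commented-out readPersistObject call, and the assumed element shape (entity tuples paired with the sentence's words) comes from the sample above.

pklPath = inout.getDataPklPath('sentence_feature_list_corpus_complete_sentence.pkl')
sentenceFeatureList, typeList = inout.readPersistObject(pklPath)
neList, wordList = sentenceFeatureList[0]
for ne, neType in neList:
    print ne, neType    # entity text and its BIES-style type tag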