        if 'E' in indexList[i + 1]:
            # Merge this token with the following 'E-*' token into one entity
            ne = neWordList[i] + neWordList[i + 1]
            # print ne
            type = 'S' + '-' + indexList[i].split('-')[-1]
            finalNamedEntityList.append((ne, type))
        if 'S' in indexList[i]:
            # A single-token ('S-*') entity is kept as-is
            ne = neWordList[i]
            # print ne
            finalNamedEntityList.append((ne, indexList[i]))
    return finalNamedEntityList


if __name__ == '__main__':
    origin_corpus = inout.getDataOriginPath('origin_corpus_test.txt')
    infoList = inout.readListFromTxt(origin_corpus)
    # Add the logic that filters sentences out of the raw text
    infoList = getSentenceList(infoList)
    print 'Total number of original sentences: ', len(infoList)
    # exit(0)

    # Initialize two synchronized lists
    sentenceList = []
    sentenceFeatureList = []
    i = 0

    ## Named entity recognition
    for sentence in infoList:
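        # --------------------------------------------------------------
        # Illustrative sketch only, not the original loop body: each
        # sentence is presumably run through LTP named entity recognition
        # (the repo already uses LTP's SentenceSplitter and 'S-Nh'-style
        # tags), roughly as below. Model paths are placeholder assumptions,
        # and the models would normally be loaded once, outside the loop.
        #
        #   from pyltp import Segmentor, Postagger, NamedEntityRecognizer
        #   segmentor = Segmentor();  segmentor.load('ltp_data/cws.model')
        #   postagger = Postagger();  postagger.load('ltp_data/pos.model')
        #   recognizer = NamedEntityRecognizer();  recognizer.load('ltp_data/ner.model')
        #
        #   words   = list(segmentor.segment(sentence))
        #   postags = list(postagger.postag(words))
        #   netags  = list(recognizer.recognize(words, postags))  # e.g. ['S-Nh', 'O', ...]
        # --------------------------------------------------------------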
    analysisPath = inout.getDataAnalysisPath('analysis.txt')

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic here; the stop word lists can be merged properly later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    # corpusPath = inout.getDataOriginPath('special_corpus_copy.txt')
    corpusPath = inout.getDataOriginPath('special_corpus.txt')

    ## 1 Complex text data has to be cleaned first
    # Sentence splitting can be used in the initial text-cleaning pass
    # corpus = inout.onlyReadLine(corpusPath)
    # sentences = SentenceSplitter.split(corpus)
    # sentences = '\t'.join(sentences)#.decode('utf-8')
    # sentenceList = sentences.split('\t')

    # Read the list data directly from the text file
    sentenceList = inout.readListFromTxt(corpusPath)
    printEscapeStr(sentenceList)
    # exit(0)

    # Named entity type dictionary (LTP-style tags such as Nh/Ni/Ns)
    neTypeDic = getNamedEntityTypeDic()

    for originSentenceI in range(len(sentenceList)):
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from utils import inout
from utils.inout import printEscapeStr
import jieba
import codecs

if __name__ == '__main__':
    inFilePath = inout.getDataOriginPath('snow_white_origin.txt')
    outFilePath = inout.getDataOriginPath('snow_white_clean.txt')

    infoList = inout.readListFromTxt(inFilePath)
    outputStr = ''
    for item in infoList:
        sentence = item.replace('\t', '').replace('\n', '').replace('\r', '').strip()
        sentenceSplitList = jieba.cut(sentence)
        outputStr = outputStr + ' '.join(sentenceSplitList) + ' '
        outputStr = outputStr + '\n'

    fw = codecs.open(outFilePath, 'w', 'utf-8')
    fw.write(outputStr)
    fw.close()
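# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the cleaning script above: jieba.cut
# returns a generator of tokens, so the cleaned file ends up with one
# space-separated, segmented sentence per input line. The sample sentence
# below is made up for demonstration.
# ---------------------------------------------------------------------------
# import jieba
#
# demoSentence = u'白雪公主和七个小矮人住在森林里'
# print ' '.join(jieba.cut(demoSentence))
# # Rough expected output (actual segmentation may differ):
# # 白雪公主 和 七个 小矮人 住 在 森林 里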
def packageWord2Sentence(sentenceWordList):
    '''
    Convert a list of words back into a sentence
    '''
    sentence = ''
    for item in sentenceWordList:
        # Each item looks like 'word_POS'; keep only the word part
        sentence = sentence + item.split('_')[0].strip()
    return sentence


if __name__ == '__main__':
    origin_corpus = inout.getDataOriginPath('origin_corpus_cmpp.txt')
    # origin_corpus = inout.getDataTestPath('origin_corpus_test.txt')
    infoList = inout.readListFromTxt(origin_corpus)

    # Initialize two synchronized lists
    sentenceList = []
    sentenceFeatureList = []
    j = 0
    for i in range(len(infoList)):
        line = infoList[i].strip()
        if line:
            if line != '_w _w':
                lineList = line.split(' ')
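# ---------------------------------------------------------------------------
# Usage sketch for packageWord2Sentence (illustrative, hypothetical values):
#   packageWord2Sentence(['今天_nt', '天气_n', '不错_a'])  ->  '今天天气不错'
# The '_POS' suffixes are dropped and the words are concatenated back into a
# plain sentence string.
# ---------------------------------------------------------------------------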
            ne = neWordList[i] + neWordList[i + 1]
            # print ne
            type = 'S' + '-' + indexList[i].split('-')[-1]
            finalNamedEntityList.append((ne, type))
        if 'S' in indexList[i]:
            ne = neWordList[i]
            # print ne
            finalNamedEntityList.append((ne, indexList[i]))
    return finalNamedEntityList


if __name__ == '__main__':
    inputFileName = ''
    origin_corpus = inout.getDataOriginPath(inputFileName)
    # origin_corpus = inout.getDataTestPath('origin_corpus_test.txt')
    infoList = inout.readListFromTxt(origin_corpus)
    print 'Finished loading the data...'

    # Initialize two synchronized lists
    sentenceList = []
    sentenceFeatureList = []
    j = 0
    for i in range(len(infoList)):
        line = infoList[i].strip()
        if line:
            # Skip the '_w _w' placeholder lines
            if line != '_w _w':
            # outputLine = line.strip()
            # fw.write(outputLine + '\n')
            resultList.append(line.strip())
            # i += 1

    print i

    # fw.close()
    fr.close()

    outFilePath = inout.getDataOriginPath('graph_candidate_entity_relation_150w-2100w.txt')
    inout.writeList2Txt(outFilePath, resultList)
    print 'Finished writing...'

    """
    test: inspected the internal structure of the original pkl object:
    [[('尼古拉斯', 'S-Nh'), ('奥古斯特·科波拉', 'S-Nh')],
     ['生于', '加州', '一个', '中产', '家庭', ',', '意大利', '裔', '父亲', '是', '文学', '教授', ',', '德国裔', '的', '母亲', 'Joy']]
    """
    # sentenceFeatureList, typeList = inout.readPersistObject(inout.getDataPklPath('sentence_feature_list_corpus_complete_sentence.pkl'))
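# ---------------------------------------------------------------------------
# Illustrative sketch of the utils.inout helpers used throughout these
# scripts. This is an assumption about their behaviour, not the repository's
# actual code: it only captures the read-a-list / write-a-list round trip the
# scripts rely on, with hypothetical 'Sketch'-suffixed names.
# ---------------------------------------------------------------------------
import codecs


def readListFromTxtSketch(filePath):
    # Read a UTF-8 text file and return its lines as a list
    fr = codecs.open(filePath, 'r', 'utf-8')
    infoList = fr.readlines()
    fr.close()
    return infoList


def writeList2TxtSketch(filePath, infoList):
    # Write each list item as one line of a UTF-8 text file
    fw = codecs.open(filePath, 'w', 'utf-8')
    fw.write('\n'.join(infoList))
    fw.close()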