def genSegmentTermList(termList):
    """Normalize a raw term list for segmentation use.

    Pipeline: strip useless whitespace -> promote bracketed content to
    standalone terms -> trim invalid edge characters -> keep only the first
    comma-separated segment -> drop punctuation-bearing, all-digit and
    time/temperature terms -> remove stopwords. Returns the deduplicated
    list sorted by length.
    """
    cleaned = uniqueList([removeUselessSpace(t) for t in termList])

    expanded = []  # bracketed content becomes standalone terms
    for t in tqdm(cleaned):
        mainTerm, bracketTerms = removeBracket(t)
        expanded.append(mainTerm)
        expanded.extend(bracketTerms)
    cleaned = uniqueList(expanded)
    print('size={}'.format(len(cleaned)))

    cleaned = [removeBeginEnd(t) for t in tqdm(cleaned)]  # trim both ends

    # Split on ASCII/fullwidth comma; keep only the leading segment.
    firstParts = [re.split('[,,]', t, maxsplit=1)[0] for t in tqdm(cleaned)]
    cleaned = uniqueList(firstParts)
    print('size={}'.format(len(cleaned)))

    cleaned = [t for t in cleaned if not containPunc(t)]  # has punctuation
    cleaned = [t for t in cleaned if not allDigit(t)]     # digits only
    cleaned = [t for t in cleaned
               if not containUselessDigitTerm(t)]         # time/temperature words
    cleaned = removeStopwords(cleaned)
    cleaned = uniqueList(cleaned)
    print('size={}'.format(len(cleaned)))
    return sortedByLength(cleaned)
def genSegmentTermList(termList):
    """Clean a term list for segmentation.

    Strips useless whitespace, trims invalid edge characters, discards
    narrative sentences, splits terms on ASCII/fullwidth commas, then filters
    out terms that contain punctuation, are pure digits, contain time or
    temperature words, fall outside 1 < len < 20, lack Chinese characters, or
    start with '见' ("see ..." cross-references), and finally removes
    stopwords. Returns the deduplicated result sorted by length.
    """
    termList = uniqueList([removeUselessSpace(term) for term in termList])
    termList = [removeBeginEnd(term) for term in tqdm(termList)]  # trim edges
    termList = [term for term in termList if not isDescription(term)]  # drop narrative sentences
    temp = []  # split each term on ASCII/fullwidth commas
    for term in tqdm(termList):
        temp.extend(subword.strip() for subword in re.split('[,,]', term))
    termList = uniqueList(temp)
    print('size={}'.format(len(termList)))
    termList = [term for term in termList if not containPunc(term)]  # has punctuation
    termList = [term for term in termList if not allDigit(term)]  # digits only
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]  # time/temperature words
    termList = [
        term for term in termList
        if len(term) > 1 and len(term) < 20 and containCNS(term)
    ]  # keep 1 < len < 20 and containing Chinese
    # BUG FIX: raw string for the regex — '\w' inside a plain string literal
    # is an invalid escape sequence (SyntaxWarning on Python 3.12+).
    termList = [term for term in termList if re.match(r'^见[\w]+', term) is None]
    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # dedupe
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
def baikeScript():
    """Merge the wikipedia and baidubaike term files into umls_baike.txt.

    Reads both source files from `folder`, dedupes the union, sorts by
    length, and writes one term per line to umls_baike.txt.
    """
    # FIX: use context managers so every file handle is closed
    # deterministically (the original leaked all three handles).
    with open(folder + os.sep + 'umls_wikipedia.txt') as f:
        wikiTerms = f.read().splitlines()
    with open(folder + os.sep + 'umls_baidubaike.txt') as f:
        baiduTerms = f.read().splitlines()
    termList = sortedByLength(uniqueList(wikiTerms + baiduTerms))
    print('size={}'.format(len(termList)))
    with open(folder + os.sep + 'umls_baike.txt', 'w') as f:
        print('\n'.join(termList), file=f)
def genSegmentTermList(termList):
    """Full cleaning pipeline for segmentation terms.

    Whitespace cleanup -> coarse filter (len > 1, contains Chinese) ->
    bracket extraction -> edge trimming -> punctuation split -> digit and
    time/temperature filters -> second edge trim -> strict filter
    (1 < len < 20, all Chinese) -> stopword removal. Returns the
    deduplicated list sorted by length.
    """
    terms = uniqueList([removeUselessSpace(t) for t in termList])
    print('size={}'.format(len(terms)))

    # Coarse filter: longer than one character and contains Chinese.
    terms = [t for t in tqdm(terms) if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(terms)))

    expanded = []  # promote bracketed content to standalone terms
    for t in tqdm(terms):
        stripped, inner = removeBracket(t)
        expanded.append(stripped)
        expanded.extend(inner)
    terms = uniqueList(expanded)
    print('size={}'.format(len(terms)))

    terms = [removeBeginEnd(t) for t in tqdm(terms)]  # trim invalid edges
    print('size={}'.format(len(terms)))

    # Split on ASCII comma and ASCII/fullwidth semicolon.
    # NOTE(review): the original comment also mentions fullwidth comma,
    # period and colon, but they are absent from the character class —
    # confirm whether that was intentional.
    pieces = []
    for t in tqdm(terms):
        pieces.extend(p.strip() for p in re.split('[,;;]', t))
    terms = uniqueList(pieces)
    print('size={}'.format(len(terms)))

    terms = [t for t in tqdm(terms) if not allDigit(t)]  # pure digits
    terms = [t for t in tqdm(terms)
             if not containUselessDigitTerm(t)]          # time/temperature words
    terms = [removeBeginEnd(t) for t in tqdm(terms)]     # trim again after splitting
    print('size={}'.format(len(terms)))

    # Strict filter: 1 < len < 20 and entirely Chinese.
    terms = [t for t in tqdm(terms)
             if len(t) > 1 and len(t) < 20 and allCNS(t)]
    print('size={}'.format(len(terms)))

    terms = removeStopwords(terms)  # remove stopwords
    terms = uniqueList(terms)       # dedupe
    print('size={}'.format(len(terms)))
    return sortedByLength(terms)    # sorted by length
def genSegmentTermList(termList):
    """Light cleaning pass over a term list.

    Strips useless whitespace, trims invalid edge characters, drops
    all-digit and time/temperature terms, keeps only all-Chinese terms with
    1 < len < 20, removes stopwords, and returns the deduplicated result
    sorted by length.
    """
    terms = uniqueList([removeUselessSpace(t) for t in termList])
    terms = [removeBeginEnd(t) for t in tqdm(terms)]  # trim invalid edges
    # Drop pure digits and terms with time/temperature words.
    terms = [
        t for t in terms
        if not allDigit(t) and not containUselessDigitTerm(t)
    ]
    # Strict filter: 1 < len < 20 and entirely Chinese.
    terms = [t for t in terms if 1 < len(t) < 20 and allCNS(t)]
    terms = uniqueList(removeStopwords(terms))
    print('size={}'.format(len(terms)))
    return sortedByLength(terms)
def getICIBATermList():
    """Collect ICIBA (Kingsoft PowerWord) translations from GradedMT.pkl.

    Returns:
        Deduplicated list of the 'ICIBA' translation strings found in each
        entry's 'source' dict.
    """
    # FIX: close the pickle file deterministically (original leaked the
    # handle via pickle.load(open(...))).
    with open(DATA_PATH + os.sep + 'umlsMT' + os.sep + 'GradedMT.pkl',
              'rb') as f:
        gradedMT = pickle.load(f)
    termList = []
    for infoDict in tqdm(gradedMT.values()):  # AUI key was unused
        if 'ICIBA' in infoDict['source']:
            termList.append(infoDict['source']['ICIBA'])
    return uniqueList(termList)
def genSegmentTermList(termList):
    """Clean a term list for segmentation.

    Strips useless whitespace, trims invalid edge characters, splits on
    ASCII/fullwidth commas, drops punctuation-bearing / all-digit /
    time-temperature terms and single non-Chinese characters, removes
    stopwords, and returns the deduplicated result sorted by length.
    """
    terms = uniqueList([removeUselessSpace(t) for t in termList])
    terms = [removeBeginEnd(t) for t in tqdm(terms)]  # trim invalid edges

    pieces = []  # split each term on ASCII/fullwidth commas
    for t in tqdm(terms):
        pieces.extend(p.strip() for p in re.split('[,,]', t))
    terms = uniqueList(pieces)
    print('size={}'.format(len(terms)))

    terms = [t for t in terms if not containPunc(t)]             # has punctuation
    terms = [t for t in terms if not allDigit(t)]                # digits only
    terms = [t for t in terms if not containUselessDigitTerm(t)] # time/temperature words
    # Drop single characters unless they are Chinese
    # (equivalent to: not (len == 1 and not isCNS)).
    terms = [t for t in terms if len(t) != 1 or isCNS(t)]
    terms = removeStopwords(terms)  # remove stopwords
    terms = uniqueList(terms)
    print('size={}'.format(len(terms)))
    return sortedByLength(terms)
def script2():
    """Filter bgequal terms through baidubaike/wikipedia and write the union
    to umls_bgequal_baike.txt, sorted by length.
    """
    from segment.umls import baiduBaikeFilter, wikiPediaFilter

    # FIX: context-manage every file handle (the original leaked all of
    # the handles it opened).
    with open(folder + os.sep + 'umls_bgequal.txt') as f:
        termList = f.read().splitlines()
    with open(folder + os.sep + 'umls_baidubaike.txt') as f:
        baikeLines = f.read().splitlines()
    baiduTermList = baiduBaikeFilter(termList, baikeLines)
    wikiTermList = wikiPediaFilter(termList)
    termList = sortedByLength(uniqueList(baiduTermList + wikiTermList))
    print('size: {}'.format(len(termList)))
    with open(folder + os.sep + 'umls_bgequal_baike.txt', 'w') as f:
        print('\n'.join(termList), file=f)
def genSnomedctSegment(termList):
    """Clean SNOMED CT terms for segmentation.

    Whitespace cleanup, bracket extraction, edge trimming, split on
    ASCII/fullwidth commas and colons, punctuation / digit /
    time-temperature filters, keep terms longer than one character that
    contain Chinese, remove stopwords; returns the deduplicated list sorted
    by length.
    """
    terms = uniqueList([removeUselessSpace(t) for t in termList])

    expanded = []  # promote bracketed content to standalone terms
    for t in tqdm(terms):
        stripped, inner = removeBracket(t)
        expanded.append(stripped)
        expanded.extend(inner)
    terms = uniqueList(expanded)
    print('size={}'.format(len(terms)))

    terms = [removeBeginEnd(t) for t in tqdm(terms)]  # trim invalid edges
    print('size={}'.format(len(terms)))

    pieces = []  # split on ASCII/fullwidth commas and colons
    for t in tqdm(terms):
        pieces.extend(p.strip() for p in re.split('[,,::]', t))
    terms = uniqueList(pieces)
    print('size={}'.format(len(terms)))

    terms = [t for t in terms if not containPunc(t)]             # has punctuation
    terms = [t for t in terms if not allDigit(t)]                # digits only
    terms = [t for t in terms if not containUselessDigitTerm(t)] # time/temperature words
    print('size={}'.format(len(terms)))

    # Keep terms longer than one character that contain Chinese.
    terms = [t for t in terms if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(terms)))

    terms = uniqueList(removeStopwords(terms))  # stopwords out, then dedupe
    print('size={}'.format(len(terms)))
    return sortedByLength(terms)
def genBGUnorderEqual():
    """Collect terms whose Baidu and Google translations agree up to order.

    Two translations match when their StandardAnalyzer token sets are equal.
    Entries with confidence > 5 are skipped. Returns the deduplicated list
    of matching Baidu and Google translation strings.
    """
    # FIX: close the pickle file deterministically (original leaked the
    # handle via pickle.load(open(...))).
    with open(DATA_PATH + os.sep + 'umlsMT' + os.sep + 'GradedMT.pkl',
              'rb') as f:
        gradedMT = pickle.load(f)
    termList = []
    analyzer = StandardAnalyzer()
    for infoDict in tqdm(gradedMT.values()):  # AUI key was unused
        if infoDict['confidence'] > 5:
            continue
        baidu = infoDict['source'].get('baidu', '')
        google = infoDict['source'].get('google', '')
        # Equal token sets => order-insensitive match between translations.
        if set(analyzer.split(baidu)) == set(analyzer.split(google)):
            termList.append(baidu)
            termList.append(google)
    return uniqueList(termList)