Exemplo n.º 1
0
def genSegmentTermList(termList):
    """Clean and filter raw terms into a deduplicated, length-sorted term list.

    Pipeline: strip useless whitespace -> strip invalid leading/trailing chars
    -> drop narrative sentences -> split on commas -> drop punctuation-bearing,
    all-digit, and time/temperature terms -> keep 2-19 char terms containing
    Chinese -> drop "见..." cross-reference entries and stopwords.

    :param termList: iterable of raw term strings
    :return: cleaned term list, sorted by length (via sortedByLength)
    """
    # Normalize whitespace and deduplicate up front so later passes are smaller.
    termList = uniqueList([removeUselessSpace(term)
                           for term in termList])

    termList = [removeBeginEnd(term) for term in tqdm(termList)]  # strip invalid leading/trailing chars
    termList = [term for term in termList
                if not isDescription(term)]  # drop narrative/descriptive sentences

    temp = []  # split entries on punctuation (full-width and ASCII comma)
    for term in tqdm(termList):
        subList = [subword.strip() for subword in re.split('[,,]', term)]
        temp.extend(subList)
    termList = uniqueList(temp)
    print('size={}'.format(len(termList)))

    termList = [term for term in termList if not containPunc(term)]  # drop terms containing punctuation
    termList = [term for term in termList if not allDigit(term)]  # drop pure-digit terms
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]  # drop terms containing time/temperature words

    termList = [
        term for term in termList
        if 1 < len(term) < 20 and containCNS(term)
    ]  # keep: length in (1, 20) and contains Chinese
    # Raw string fixes the invalid '\w' escape warning (SyntaxWarning on 3.12+);
    # the compiled pattern is unchanged. Drops "见..." cross-reference entries.
    termList = [term for term in termList if re.match(r'^见[\w]+', term) is None]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # deduplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Exemplo n.º 2
0
def genSegmentTermList(termList):
    """Build a cleaned segmentation term list (verbose pipeline variant).

    Stages: whitespace cleanup -> coarse filter (length > 1, contains Chinese)
    -> bracket extraction -> boundary trimming -> punctuation split -> digit
    filters -> boundary trimming again -> final filter (2-19 chars, all
    Chinese) -> stopword removal -> dedupe -> sort by length.
    Prints the list size after each stage for progress tracking.

    :param termList: iterable of raw term strings
    :return: cleaned term list, sorted by length
    """
    # Normalize whitespace, then deduplicate.
    termList = uniqueList([removeUselessSpace(t) for t in termList])
    print('size={}'.format(len(termList)))
    # Coarse filter: keep terms longer than one char that contain Chinese.
    termList = [t for t in tqdm(termList) if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(termList)))

    # Pull bracketed fragments out as standalone candidate terms.
    expanded = []
    for t in tqdm(termList):
        outside, inside = removeBracket(t)
        expanded.append(outside)
        expanded.extend(inside)
    termList = uniqueList(expanded)
    print('size={}'.format(len(termList)))

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Split each entry on commas/semicolons (full-width and ASCII).
    pieces = []
    for t in tqdm(termList):
        pieces.extend(p.strip() for p in re.split('[,;;]', t))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Drop pure-digit terms and terms containing time/temperature words.
    termList = [t for t in tqdm(termList) if not allDigit(t)]
    termList = [t for t in tqdm(termList) if not containUselessDigitTerm(t)]

    # Trim boundaries again after the split.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))
    # Final filter: length in (1, 20) and entirely Chinese characters.
    termList = [
        t for t in tqdm(termList) if len(t) > 1 and len(t) < 20 and allCNS(t)
    ]
    print('size={}'.format(len(termList)))

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # deduplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)  # sort by length
Exemplo n.º 3
0
def genSnomedctSegment(termList):
    """Clean SNOMED-CT terms into a deduplicated, length-sorted list.

    Stages: whitespace cleanup -> bracket extraction -> boundary trimming ->
    punctuation split (commas/colons) -> punctuation/digit filters -> keep
    terms longer than one char containing Chinese -> stopword removal ->
    dedupe -> sort by length. Prints the size after most stages.

    :param termList: iterable of raw term strings
    :return: cleaned term list, sorted by length
    """
    # Normalize whitespace and deduplicate.
    termList = uniqueList([removeUselessSpace(t) for t in termList])

    # Extract bracketed fragments as standalone candidate terms.
    collected = []
    for t in tqdm(termList):
        outside, inside = removeBracket(t)
        collected.append(outside)
        collected.extend(inside)
    termList = uniqueList(collected)
    print('size={}'.format(len(termList)))

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Split entries on commas and colons (full-width and ASCII).
    fragments = []
    for t in tqdm(termList):
        fragments.extend(s.strip() for s in re.split('[,,::]', t))
    termList = uniqueList(fragments)
    print('size={}'.format(len(termList)))

    # Drop punctuation-bearing, all-digit, and time/temperature terms.
    termList = [t for t in termList if not containPunc(t)]
    termList = [t for t in termList if not allDigit(t)]
    termList = [t for t in termList if not containUselessDigitTerm(t)]
    print('size={}'.format(len(termList)))

    # Keep terms longer than one char that contain Chinese.
    termList = [t for t in termList if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(termList)))

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # deduplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Exemplo n.º 4
0
	def genMT(self):
		"""Build the merged translation dict keyed by AUI.

		Collects per-source AUI->term maps from the curated mappers, then
		layers in Baidu/Google machine translations, assigns each AUI a
		preferred source and a confidence level 1-6 (lower = more trusted),
		prints per-confidence statistics and coverage rates, and returns
		the merged dict.
		"""
		# Gather AUI->term maps from each curated source; list order below
		# doubles as the source-preference order used further down.
		allMapDict = {}
		prtMapperList = [
			UMLSCHIMapGenerator(), HPOMapGenerator(), SNOMEDMapGenerator1(), ICD10MapGenerator(),
			ICIBAMapGenerator(), SNOMEDMapGenerator2(), MeSHMapGenerator()
		]
		for mapper in prtMapperList:
			mapDict = mapper.getAUIMapDict()
			allMapDict[mapper.source] = mapDict
			print('{}: {}'.format(mapper.source, len(mapDict)))

		retDict = {}   # AUI -> entry dict (sources, prefer, confidence, ...)
		for source, mapDict in allMapDict.items():
			self.addSource(retDict, source, mapDict)
		auiMapUmlsTerm = self.getAUIMapUmlsTerm()
		self.addUmlsTerm(retDict, auiMapUmlsTerm)

		# Pick the first source (in mapper-list priority order) that has a
		# translation for this AUI; confidence 1 (best) except MeSH (4) and
		# ICIBA (2).
		# NOTE(review): if no mapper source matched, 'preferSource' is never
		# set here and the comparisons below would raise KeyError — presumably
		# every AUI in retDict has at least one mapper source; confirm.
		for AUI, item in retDict.items():
			for mapper in prtMapperList: # uses the mapDict translation
				source = mapper.source
				if source in retDict[AUI]['source']:
					retDict[AUI]['preferSource'] = source
					retDict[AUI]['prefer'] = retDict[AUI]['source'][source]
					break
			if retDict[AUI]['preferSource'] == MeSH_SOURCE:
				retDict[AUI]['confidence'] = 4
			elif retDict[AUI]['preferSource'] == ICIBA_SOURCE:
				retDict[AUI]['confidence'] = 2
			else:
				retDict[AUI]['confidence'] = 1

		# Layer in machine translations from Baidu and Google.
		auiLists = json.load(open(self.AUI_JSON))
		termList = json.load(open(self.UMLS_ENG_JSON))
		baiduCNSList = json.load(open(self.UMLS_BAIDU_JSON))
		googleCNSList = json.load(open(self.UMLS_GOOGLE_JSON))
		self.addSource(retDict, BAIDU_SOURCE, self.getMapDictForMT(auiLists, termList, baiduCNSList))
		self.addSource(retDict, GOOGLE_SOURCE, self.getMapDictForMT(auiLists, termList, googleCNSList))

		# Cross-check Baidu vs. Google: exact token agreement -> confidence 3,
		# bag-of-words agreement -> 5; otherwise keep 6 and prefer whichever
		# translation has fewer residual English tokens.
		analyzer = StandardAnalyzer()
		for AUI, item in tqdm(retDict.items()):
			if not (BAIDU_SOURCE in item['source'] and GOOGLE_SOURCE in item['source']):
				continue
			baiduTerm = item['source'][BAIDU_SOURCE]
			googleTerm = item['source'][GOOGLE_SOURCE]
			wordListB = analyzer.split(baiduTerm)
			wordListG = analyzer.split(googleTerm)
			if 'confidence' not in retDict[AUI]:
				# MT-only entry (no curated source): default to worst confidence.
				retDict[AUI]['confidence'] = 6
			if retDict[AUI]['confidence'] > 2 and tokenEqualList(wordListB, wordListG) and containCNS(baiduTerm):
				retDict[AUI]['confidence'] = 3
				retDict[AUI]['preferSource'] = BAIDU_SOURCE
				retDict[AUI]['prefer'] = retDict[AUI]['source'][BAIDU_SOURCE]

			if retDict[AUI]['confidence'] > 4 and bagOfWordsEqualList(wordListB, wordListG) and containCNS(baiduTerm):
				retDict[AUI]['confidence'] = 5
				retDict[AUI]['preferSource'] = BAIDU_SOURCE
				retDict[AUI]['prefer'] = retDict[AUI]['source'][BAIDU_SOURCE]

			if retDict[AUI]['confidence'] == 6:
				if negENGCount(wordListB) >= negENGCount(wordListG):
					retDict[AUI]['preferSource'] = BAIDU_SOURCE
					retDict[AUI]['prefer'] = retDict[AUI]['source'][BAIDU_SOURCE]
				else:
					retDict[AUI]['preferSource'] = GOOGLE_SOURCE
					retDict[AUI]['prefer'] = retDict[AUI]['source'][GOOGLE_SOURCE]

		# Report the confidence distribution and manual coverage rates.
		print('making statistics...')
		countDict = {i:0 for i in range(1, 7)}
		for AUI in retDict:
			countDict[retDict[AUI]['confidence']] += 1
		for confidence in countDict:
			print('confidence %d: %d/%d, %f' % (confidence, countDict[confidence], len(retDict), countDict[confidence] * 1.0 / len(retDict)))
		self.printManualCUICoverRate(retDict)
		self.printManualAUICoverRate(retDict)

		return retDict