def genSegmentTermList(termList):
    """Clean and filter raw terms into a unique, length-sorted segment term list.

    Pipeline: strip useless whitespace -> trim invalid leading/trailing
    characters -> drop descriptive sentences -> split on commas (ASCII and
    fullwidth) -> drop terms containing punctuation, all-digit terms, and
    terms with time/temperature words -> keep terms of length 2..19 that
    contain Chinese -> drop '见X' cross-reference terms -> remove stopwords
    -> deduplicate -> sort by length.

    NOTE(review): this file defines genSegmentTermList twice; a later
    definition shadows this one at import time — confirm which is intended.

    :param termList: iterable of raw term strings
    :return: cleaned, deduplicated term list sorted by length
    """
    termList = uniqueList([removeUselessSpace(term) for term in termList])  # strip useless whitespace
    termList = [removeBeginEnd(term) for term in tqdm(termList)]  # trim invalid leading/trailing chars
    termList = [term for term in termList if not isDescription(term)]  # drop descriptive sentences
    temp = []  # split terms on ASCII / fullwidth comma
    for term in tqdm(termList):
        subList = [subword.strip() for subword in re.split('[,,]', term)]
        temp.extend(subList)
    termList = uniqueList(temp)
    print('size={}'.format(len(termList)))
    termList = [term for term in termList if not containPunc(term)]  # drop terms containing punctuation
    termList = [term for term in termList if not allDigit(term)]  # drop pure digits
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]  # drop terms with time / temperature words
    termList = [
        term for term in termList if len(term) > 1 and len(term) < 20 and containCNS(term)
    ]  # length in (1, 20) and contains Chinese
    # Fix: the pattern was a non-raw string, so '\w' was a deprecated escape
    # sequence (SyntaxWarning on modern CPython); a raw string is equivalent.
    termList = [term for term in termList if re.match(r'^见[\w]+', term) is None]  # drop '见X' cross references
    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # deduplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
def genSegmentTermList(termList):
    """Normalize, split and filter a raw term list into unique Chinese terms.

    Stages: whitespace cleanup -> coarse filter (length > 1, contains
    Chinese) -> bracket extraction -> edge trimming -> punctuation split ->
    digit filters -> edge trimming again -> final filter (length 2..19,
    all Chinese) -> stopword removal -> dedup -> sort by length.

    NOTE(review): this redefines genSegmentTermList and shadows the earlier
    definition in this file — confirm that is intentional.
    """
    # Strip useless whitespace, dedup.
    termList = uniqueList([removeUselessSpace(t) for t in termList])
    print('size={}'.format(len(termList)))
    # Coarse filter: longer than one char and contains Chinese.
    termList = [t for t in tqdm(termList) if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(termList)))
    # Pull bracketed content out into standalone terms.
    extracted = []
    for t in tqdm(termList):
        stripped, bracketTerms = removeBracket(t)
        extracted.append(stripped)
        extracted.extend(bracketTerms)
    termList = uniqueList(extracted)
    print('size={}'.format(len(termList)))
    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))
    # Split each term on ASCII comma and ASCII/fullwidth semicolon.
    pieces = []
    for t in tqdm(termList):
        pieces.extend(part.strip() for part in re.split('[,;;]', t))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))
    # Drop all-digit terms, then terms carrying time/temperature words.
    termList = [t for t in tqdm(termList) if not allDigit(t)]
    termList = [t for t in tqdm(termList) if not containUselessDigitTerm(t)]
    # Trim edges once more after splitting.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))
    # Final filter: length in (1, 20) and entirely Chinese.
    termList = [t for t in tqdm(termList) if len(t) > 1 and len(t) < 20 and allCNS(t)]
    print('size={}'.format(len(termList)))
    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)       # deduplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)       # sort by length
def genSnomedctSegment(termList):
    """Build a cleaned SNOMED CT segment term list, sorted by length.

    Stages: whitespace cleanup -> bracket extraction -> edge trimming ->
    split on commas/colons -> punctuation, digit and time/temperature
    filters -> keep terms longer than one char containing Chinese ->
    stopword removal -> dedup.
    """
    # Strip useless whitespace, dedup.
    termList = uniqueList([removeUselessSpace(t) for t in termList])
    # Pull bracketed content out into standalone terms.
    extracted = []
    for t in tqdm(termList):
        stripped, bracketTerms = removeBracket(t)
        extracted.append(stripped)
        extracted.extend(bracketTerms)
    termList = uniqueList(extracted)
    print('size={}'.format(len(termList)))
    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))
    # Split each term on ASCII/fullwidth comma and colon.
    pieces = []
    for t in tqdm(termList):
        pieces.extend(part.strip() for part in re.split('[,,::]', t))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))
    # Drop terms containing punctuation, all-digit terms, and terms with
    # time/temperature words.
    termList = [t for t in termList if not containPunc(t)]
    termList = [t for t in termList if not allDigit(t)]
    termList = [t for t in termList if not containUselessDigitTerm(t)]
    print('size={}'.format(len(termList)))
    # Keep terms longer than one char that contain Chinese.
    termList = [t for t in termList if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(termList)))
    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)       # deduplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
def genMT(self):
    """Merge per-source AUI translation dicts and MT output into one dict.

    Builds retDict keyed by AUI; each entry carries 'source' (per-source
    translations), a chosen 'prefer'/'preferSource', and an integer
    'confidence' (1 best .. 6 worst).  Priority: curated mapper sources
    (confidence 1/2/4 depending on source), then Baidu/Google MT agreement
    (3 = token-equal, 5 = bag-of-words-equal, 6 = disagreement, broken by
    fewer untranslated English words).  Prints per-confidence statistics
    and manual CUI/AUI cover rates before returning.

    Fix: the four json.load(open(...)) calls leaked file handles; they now
    use 'with' context managers.  Behavior is otherwise unchanged.

    :return: dict mapping AUI -> {'source': ..., 'prefer': ...,
             'preferSource': ..., 'confidence': ...}
    """
    allMapDict = {}
    # Mapper order doubles as translation-source priority (first match wins).
    prtMapperList = [
        UMLSCHIMapGenerator(), HPOMapGenerator(), SNOMEDMapGenerator1(),
        ICD10MapGenerator(), ICIBAMapGenerator(), SNOMEDMapGenerator2(),
        MeSHMapGenerator()
    ]
    for mapper in prtMapperList:
        mapDict = mapper.getAUIMapDict()
        allMapDict[mapper.source] = mapDict
        print('{}: {}'.format(mapper.source, len(mapDict)))
    retDict = {}
    for source, mapDict in allMapDict.items():
        self.addSource(retDict, source, mapDict)
    auiMapUmlsTerm = self.getAUIMapUmlsTerm()
    self.addUmlsTerm(retDict, auiMapUmlsTerm)
    for AUI, item in retDict.items():
        for mapper in prtMapperList:  # take the first mapper translation available
            source = mapper.source
            if source in retDict[AUI]['source']:
                retDict[AUI]['preferSource'] = source
                retDict[AUI]['prefer'] = retDict[AUI]['source'][source]
                break
        # Curated-source confidence: MeSH = 4, ICIBA = 2, anything else = 1.
        if retDict[AUI]['preferSource'] == MeSH_SOURCE:
            retDict[AUI]['confidence'] = 4
        elif retDict[AUI]['preferSource'] == ICIBA_SOURCE:
            retDict[AUI]['confidence'] = 2
        else:
            retDict[AUI]['confidence'] = 1
    # Fixed: open the JSON inputs with context managers so the file handles
    # are closed deterministically (previously json.load(open(...)) leaked).
    with open(self.AUI_JSON) as f:
        auiLists = json.load(f)
    with open(self.UMLS_ENG_JSON) as f:
        termList = json.load(f)
    with open(self.UMLS_BAIDU_JSON) as f:
        baiduCNSList = json.load(f)
    with open(self.UMLS_GOOGLE_JSON) as f:
        googleCNSList = json.load(f)
    self.addSource(retDict, BAIDU_SOURCE,
                   self.getMapDictForMT(auiLists, termList, baiduCNSList))
    self.addSource(retDict, GOOGLE_SOURCE,
                   self.getMapDictForMT(auiLists, termList, googleCNSList))
    analyzer = StandardAnalyzer()
    for AUI, item in tqdm(retDict.items()):
        # Only AUIs translated by both MT engines can be cross-checked.
        if not (BAIDU_SOURCE in item['source'] and GOOGLE_SOURCE in item['source']):
            continue
        baiduTerm = item['source'][BAIDU_SOURCE]
        googleTerm = item['source'][GOOGLE_SOURCE]
        wordListB = analyzer.split(baiduTerm)
        wordListG = analyzer.split(googleTerm)
        if 'confidence' not in retDict[AUI]:
            retDict[AUI]['confidence'] = 6  # AUI added by MT only: worst until upgraded
        # Exact token agreement between engines upgrades confidence to 3.
        if retDict[AUI]['confidence'] > 2 and tokenEqualList(wordListB, wordListG) and containCNS(baiduTerm):
            retDict[AUI]['confidence'] = 3
            retDict[AUI]['preferSource'] = BAIDU_SOURCE
            retDict[AUI]['prefer'] = retDict[AUI]['source'][BAIDU_SOURCE]
        # Bag-of-words agreement upgrades to 5.
        if retDict[AUI]['confidence'] > 4 and bagOfWordsEqualList(wordListB, wordListG) and containCNS(baiduTerm):
            retDict[AUI]['confidence'] = 5
            retDict[AUI]['preferSource'] = BAIDU_SOURCE
            retDict[AUI]['prefer'] = retDict[AUI]['source'][BAIDU_SOURCE]
        # Still 6: engines disagree; prefer the one with fewer untranslated
        # English tokens (ties go to Baidu).
        if retDict[AUI]['confidence'] == 6:
            if negENGCount(wordListB) >= negENGCount(wordListG):
                retDict[AUI]['preferSource'] = BAIDU_SOURCE
                retDict[AUI]['prefer'] = retDict[AUI]['source'][BAIDU_SOURCE]
            else:
                retDict[AUI]['preferSource'] = GOOGLE_SOURCE
                retDict[AUI]['prefer'] = retDict[AUI]['source'][GOOGLE_SOURCE]
    print('making statistics...')
    countDict = {i: 0 for i in range(1, 7)}
    for AUI in retDict:
        countDict[retDict[AUI]['confidence']] += 1
    for confidence in countDict:
        print('confidence %d: %d/%d, %f' % (
            confidence, countDict[confidence], len(retDict),
            countDict[confidence] * 1.0 / len(retDict)))
    self.printManualCUICoverRate(retDict)
    self.printManualAUICoverRate(retDict)
    return retDict