Пример #1
0
def genSegmentTermList(termList):
    """Clean raw terms into a de-duplicated list of segmentation terms.

    Steps, in order: remove useless whitespace, pull bracketed content out
    as extra candidate terms, strip invalid leading/trailing characters,
    keep only the part before the first comma, then drop terms that contain
    punctuation, are all digits, contain time/temperature words, or are
    stopwords.

    Args:
        termList: iterable of raw term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned,
        de-duplicated terms (ordering is decided by that helper).
    """
    # Remove useless whitespace, then de-duplicate.
    termList = uniqueList([removeUselessSpace(term) for term in termList])

    # Extract bracket contents: both the term returned by removeBracket and
    # every bracketed fragment it reports become candidates.
    candidates = []
    for term in tqdm(termList):
        newTerm, bracketTerm = removeBracket(term)
        candidates.append(newTerm)
        candidates.extend(bracketTerm)
    termList = uniqueList(candidates)
    print('size={}'.format(len(termList)))

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(term) for term in tqdm(termList)]

    # Split on a comma and keep only the first segment. Both the ASCII comma
    # and the full-width comma (U+FF0C) used in Chinese text are matched; the
    # escape keeps the pattern intact if the file is width-normalised.
    # Compiled once, outside the loop.
    commaPattern = re.compile('[,\uff0c]')
    termList = uniqueList(
        [commaPattern.split(term, maxsplit=1)[0] for term in tqdm(termList)]
    )
    print('size={}'.format(len(termList)))

    # Drop terms that contain punctuation.
    termList = [term for term in termList if not containPunc(term)]
    # Drop purely numeric terms.
    termList = [term for term in termList if not allDigit(term)]
    # Drop terms containing time words or temperature words.
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # de-duplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Пример #2
0
def genSegmentTermList(termList):
    """Clean raw terms into a de-duplicated list of segmentation terms.

    Steps, in order: remove useless whitespace, strip invalid
    leading/trailing characters, drop descriptive sentences, split each term
    on commas, then drop pieces that contain punctuation, are all digits,
    contain time/temperature words, fall outside the 2..19 character range,
    contain no Chinese, start with "见" followed by word characters, or are
    stopwords.

    Args:
        termList: iterable of raw term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned,
        de-duplicated terms (ordering is decided by that helper).
    """
    # Remove useless whitespace, then de-duplicate.
    termList = uniqueList([removeUselessSpace(term) for term in termList])

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(term) for term in tqdm(termList)]
    # Drop narrative / descriptive sentences.
    termList = [term for term in termList if not isDescription(term)]

    # Split every term on commas and keep all pieces. Both the ASCII comma
    # and the full-width comma (U+FF0C) used in Chinese text are matched; the
    # escape keeps the pattern intact if the file is width-normalised.
    commaPattern = re.compile('[,\uff0c]')
    pieces = []
    for term in tqdm(termList):
        pieces.extend(part.strip() for part in commaPattern.split(term))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Drop terms that contain punctuation.
    termList = [term for term in termList if not containPunc(term)]
    # Drop purely numeric terms.
    termList = [term for term in termList if not allDigit(term)]
    # Drop terms containing time words or temperature words.
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]

    # Keep terms longer than 1 and shorter than 20 that contain Chinese.
    termList = [
        term for term in termList if 1 < len(term) < 20 and containCNS(term)
    ]

    # Drop entries that start with "见" followed by word characters
    # (presumably "see ..." cross-references; confirm against the data).
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    seeAlsoPattern = re.compile(r'^见[\w]+')
    termList = [term for term in termList if seeAlsoPattern.match(term) is None]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # de-duplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Пример #3
0
def genSegmentTermList(termList):
    """Clean raw terms into a de-duplicated, all-Chinese segmentation list.

    Steps, in order: remove useless whitespace, pre-filter to terms longer
    than one character that contain Chinese, pull bracketed content out as
    extra candidates, strip invalid leading/trailing characters, split on
    commas and semicolons, drop all-digit terms and terms with
    time/temperature words, strip leading/trailing characters again, keep
    only all-Chinese terms of 2..19 characters, and remove stopwords.
    The list size is printed after most stages.

    Args:
        termList: iterable of raw term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned,
        de-duplicated terms (ordering is decided by that helper).
    """
    # Remove useless whitespace, then de-duplicate.
    termList = uniqueList([removeUselessSpace(term) for term in termList])
    print('size={}'.format(len(termList)))

    # First-pass filter: longer than 1 character and contains Chinese.
    termList = [
        term for term in tqdm(termList) if len(term) > 1 and containCNS(term)
    ]
    print('size={}'.format(len(termList)))

    # Extract bracket contents: both the term returned by removeBracket and
    # every bracketed fragment it reports become candidates.
    candidates = []
    for term in tqdm(termList):
        newTerm, bracketTerm = removeBracket(term)
        candidates.append(newTerm)
        candidates.extend(bracketTerm)
    termList = uniqueList(candidates)
    print('size={}'.format(len(termList)))

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(term) for term in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Split on commas and semicolons, in both ASCII and full-width form
    # (U+FF0C, U+FF1B). Escapes keep the pattern intact if the file is
    # width-normalised. Compiled once, outside the loop.
    splitPattern = re.compile('[,\uff0c;\uff1b]')
    pieces = []
    for term in tqdm(termList):
        pieces.extend(part.strip() for part in splitPattern.split(term))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Drop purely numeric terms.
    termList = [term for term in tqdm(termList) if not allDigit(term)]
    # Drop terms containing time words or temperature words.
    termList = [
        term for term in tqdm(termList) if not containUselessDigitTerm(term)
    ]

    # Strip leading/trailing characters again (presumably because splitting
    # can expose new edge characters).
    termList = [removeBeginEnd(term) for term in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Final filter: longer than 1, shorter than 20, and entirely Chinese.
    termList = [
        term for term in tqdm(termList) if 1 < len(term) < 20 and allCNS(term)
    ]
    print('size={}'.format(len(termList)))

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # de-duplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)  # sort by length
Пример #4
0
def genSegmentTermList(termList):
    """Reduce raw terms to a de-duplicated list of short, all-Chinese terms.

    Whitespace is cleaned up and duplicates removed, invalid
    leading/trailing characters are stripped, and then terms are discarded
    if they are all digits, contain time/temperature words, are not between
    2 and 19 characters long, or are not entirely Chinese. Stopwords are
    removed last.

    Args:
        termList: iterable of raw term strings.

    Returns:
        The result of ``sortedByLength`` applied to the surviving terms
        (ordering is decided by that helper).
    """
    # Remove useless whitespace, then de-duplicate.
    despaced = [removeUselessSpace(raw) for raw in termList]
    working = uniqueList(despaced)

    # Strip invalid leading/trailing characters.
    trimmed = []
    for item in tqdm(working):
        trimmed.append(removeBeginEnd(item))

    # Drop purely numeric terms.
    nonNumeric = []
    for item in trimmed:
        if allDigit(item):
            continue
        nonNumeric.append(item)

    # Drop terms containing time words or temperature words.
    meaningful = []
    for item in nonNumeric:
        if containUselessDigitTerm(item):
            continue
        meaningful.append(item)

    # Final filter: longer than 1, shorter than 20, and Chinese only.
    finalTerms = []
    for item in meaningful:
        if 1 < len(item) < 20 and allCNS(item):
            finalTerms.append(item)

    # Remove stopwords, then de-duplicate once more.
    finalTerms = uniqueList(removeStopwords(finalTerms))
    print('size={}'.format(len(finalTerms)))
    return sortedByLength(finalTerms)
Пример #5
0
def genSegmentTermList(termList):
    """Clean raw terms into a de-duplicated list of segmentation terms.

    Steps, in order: remove useless whitespace, strip invalid
    leading/trailing characters, split each term on commas, then drop pieces
    that contain punctuation, are all digits, contain time/temperature
    words, are a single character for which ``isCNS`` is false, or are
    stopwords.

    Args:
        termList: iterable of raw term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned,
        de-duplicated terms (ordering is decided by that helper).
    """
    # Remove useless whitespace, then de-duplicate.
    termList = uniqueList([removeUselessSpace(term) for term in termList])

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(term) for term in tqdm(termList)]

    # Split every term on commas and keep all pieces. Both the ASCII comma
    # and the full-width comma (U+FF0C) used in Chinese text are matched; the
    # escape keeps the pattern intact if the file is width-normalised.
    commaPattern = re.compile('[,\uff0c]')
    pieces = []
    for term in tqdm(termList):
        pieces.extend(part.strip() for part in commaPattern.split(term))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Drop terms that contain punctuation.
    termList = [term for term in termList if not containPunc(term)]
    # Drop purely numeric terms.
    termList = [term for term in termList if not allDigit(term)]
    # Drop terms containing time words or temperature words.
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]
    # Drop single characters that are not Chinese; multi-character terms and
    # single Chinese characters are kept.
    termList = [term for term in termList if len(term) != 1 or isCNS(term)]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # de-duplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Пример #6
0
def genSnomedctSegment(termList):
    """Clean raw terms into a de-duplicated list of segmentation terms.

    Steps, in order: remove useless whitespace, pull bracketed content out
    as extra candidates, strip invalid leading/trailing characters, split on
    commas and colons, then drop pieces that contain punctuation, are all
    digits, contain time/temperature words, are at most one character long,
    contain no Chinese, or are stopwords. The list size is printed after
    most stages.

    Args:
        termList: iterable of raw term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned,
        de-duplicated terms (ordering is decided by that helper).
    """
    # Remove useless whitespace, then de-duplicate.
    termList = uniqueList([removeUselessSpace(term) for term in termList])

    # Extract bracket contents: both the term returned by removeBracket and
    # every bracketed fragment it reports become candidates.
    candidates = []
    for term in tqdm(termList):
        newTerm, bracketTerm = removeBracket(term)
        candidates.append(newTerm)
        candidates.extend(bracketTerm)
    termList = uniqueList(candidates)
    print('size={}'.format(len(termList)))

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(term) for term in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Split on commas and colons, in both ASCII and full-width form
    # (U+FF0C, U+FF1A). Escapes keep the pattern intact if the file is
    # width-normalised. Compiled once, outside the loop.
    splitPattern = re.compile('[,\uff0c:\uff1a]')
    pieces = []
    for term in tqdm(termList):
        pieces.extend(part.strip() for part in splitPattern.split(term))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Drop terms that contain punctuation.
    termList = [term for term in termList if not containPunc(term)]
    # Drop purely numeric terms.
    termList = [term for term in termList if not allDigit(term)]
    # Drop terms containing time words or temperature words.
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]
    print('size={}'.format(len(termList)))

    # Keep terms longer than 1 character that contain Chinese.
    termList = [
        term for term in termList if len(term) > 1 and containCNS(term)
    ]
    print('size={}'.format(len(termList)))

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # de-duplicate
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)