def extractTheme(tagList, tagbaseFilePath):
    """Return the tags from ``tagList`` that also occur in the tag base file.

    Args:
        tagList: Iterable of candidate tags. Each tag must be hashable
            (presumably strings -- confirm against callers).
        tagbaseFilePath: Path passed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list holding the matching tags in ``tagList`` order; repeated
        tags in ``tagList`` are kept.
    """
    # Build the lookup set once: testing membership against the raw list is
    # a linear scan per tag, which adds up quickly for a large tag base.
    tagbaseSet = set(io.readListFromTxt(tagbaseFilePath))
    return [item for item in tagList if item in tagbaseSet]
def filterTagFromTagbase(content, tagbaseFilePath):
    """Collect every tag base entry that is contained in ``content``.

    Containment is decided with Python's ``in`` operator, so when ``content``
    is a string this is a substring test.

    Args:
        content: Object searched for tag base entries.
        tagbaseFilePath: Path passed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list of the matching entries, in tag base order.
    """
    # Load the tag base entries.
    knownTags = io.readListFromTxt(tagbaseFilePath)
    return [tag for tag in knownTags if tag in content]
예제 #3
0
def extractTheme(tagList, tagbaseFilePath):
    """Return tags that are in the tag base file and are not stop words.

    Args:
        tagList: Iterable of candidate tags. Each tag must be hashable
            (presumably strings -- confirm against callers).
        tagbaseFilePath: Path passed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list holding, in ``tagList`` order, every tag that is absent
        from ``index.TAGBASE_STOP_WORD_LIST`` and present in the tag base.
    """
    # Build the lookup set once: testing membership against the raw list is
    # a linear scan per tag, which adds up quickly for a large tag base.
    tagbaseSet = set(io.readListFromTxt(tagbaseFilePath))
    # Resolve the stop-word container once instead of on every iteration.
    stopWords = index.TAGBASE_STOP_WORD_LIST
    return [
        item
        for item in tagList
        if item not in stopWords and item in tagbaseSet
    ]
def cleanTheme(tagList):
    """Keep only the tags from ``tagList`` that exist in ``tagbase.txt``.

    Args:
        tagList: Iterable of candidate tags. Each tag must be hashable
            (presumably strings -- confirm against callers).

    Returns:
        A new list holding the matching tags in ``tagList`` order; repeated
        tags in ``tagList`` are kept.
    """
    # Load the tags stored in the tag base file.
    filePath = io.getSourceFilePath('tagbase.txt')
    # A set gives constant-time lookups; the original scanned the whole
    # tag base list once per candidate tag.
    tagbaseSet = set(io.readListFromTxt(filePath))
    return [item for item in tagList if item in tagbaseSet]
def updateTagbase():
    """Rewrite ``tagbase.txt`` with duplicate entries removed.

    Intended to be run as a standalone maintenance step on the tag base.
    The first occurrence of every entry is kept and the original order is
    preserved, so repeated runs produce the same file. (A plain
    ``list(set(...))`` would reorder the entries, and differently on each
    run because string hashing is randomised per process.)
    """
    tagbaseFilePath = io.getSourceFilePath('tagbase.txt')

    # Original author's note: 68638 entries before de-duplication.
    tagbaseList = io.readListFromTxt(tagbaseFilePath)

    # Order-preserving de-duplication.
    # Original author's note: 67523 entries after de-duplication.
    seenTags = set()
    cleanTagbaseList = []
    for tag in tagbaseList:
        if tag not in seenTags:
            seenTags.add(tag)
            cleanTagbaseList.append(tag)

    # NOTE(review): the list is read from io.getSourceFilePath('tagbase.txt')
    # but written using the bare name 'tagbase.txt' -- confirm that
    # io.writeList2Txt resolves this to the same file.
    io.writeList2Txt('tagbase.txt', cleanTagbaseList)
def scanTheme2Tag(themeList, tagbaseFilePath):
    """Filter ``themeList`` down to the entries found in the tag base file.

    Args:
        themeList: Iterable of candidate themes. Each must be hashable
            (presumably strings -- confirm against callers).
        tagbaseFilePath: Path passed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list holding the matching entries in ``themeList`` order;
        repeated entries in ``themeList`` are kept.
    """
    # Build the lookup set once: testing membership against the raw list is
    # a linear scan per theme, which adds up quickly for a large tag base.
    tagbaseSet = set(io.readListFromTxt(tagbaseFilePath))
    return [item for item in themeList if item in tagbaseSet]
 inputFilePath = io.getSourceFilePath('investEvents_20161227144154.txt')
 outputFilePath = io.getSourceFilePath(
     'investEvents_taged_20161227144154.txt')
 tagbaseFilePath = io.getSourceFilePath(
     'tagbase_iron_tag_all_product_company.txt')
 newseedInfoOutputFilePath = io.getProcessedFilePath(
     'newseed_taged_info.csv')
 # get infoList
 infoList = io.loadData2Json(inputFilePath)
 # persist tagbase from redis
 tagbaseDic = util.getTagbaseDicFromRedis(initDic, tagbaseNameList)
 util.persistentTagbase(tagbaseDic, tagbaseFilePath)
 # load cut word user dict
 jieba.load_userdict(tagbaseFilePath)
 # get tagbaseList
 tagbaseList = io.readListFromTxt(tagbaseFilePath)
 # prepare for output
 fw = open(outputFilePath, 'w', encoding='utf-8')
 i = 1
 j = 0
 # traverse infoList
 for item in infoList:
     if item['startup']['productDesc']:
         productDesc = item['startup']['productDesc']
         # get cleaned desc
         cleanedDesc = getCleanedDesc(productDesc)
         # get cut word list
         cutWordList = getCutWordList(cleanedDesc)
         # extract tag
         ironTagList = extractTag(cutWordList, tagbaseList)
         print(i, 'extracted tag:', ironTagList)