Example #1
def wordSemanticSimilarityGraph(fileName, path):
    print('------ Building the word semantic similarity network ------')
    # Vertex set of graph G
    wordsStatisticsData, wordsData = textPreprocessing.word_segmentation(
        fileName, path)
    # Collect the encoding of each word
    wordsEncodingData = {}
    for word in wordsData:
        code = wordEncoding(word)
        wordsEncodingData[word] = code
    # Semantic similarity threshold
    b = 0.66
    graphDatas = {}
    for startWord in wordsData:
        graphData = {}
        for endWord in wordsData:
            # If the two words differ, compute their semantic similarity and add an edge when it exceeds the threshold
            if startWord != endWord:
                # Compute the semantic similarity
                sim = wordSemanticSimilarity(startWord, endWord,
                                             wordsEncodingData)
                if sim > b:
                    graphData[endWord] = sim
        if graphData:
            graphDatas[startWord] = graphData
    return graphDatas
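A minimal usage sketch for this function, assuming the project modules (textPreprocessing, wordEncoding, wordSemanticSimilarity) are importable and that the file name and directory below (purely hypothetical) point to an existing text:

graph = wordSemanticSimilarityGraph('doc1.txt', './data/')  # hypothetical file and path
for start_word, neighbours in graph.items():
    # each entry maps a word to the neighbours whose similarity exceeded b = 0.66
    for end_word, sim in neighbours.items():
        print(start_word, end_word, round(sim, 3))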
Example #2
File: main.py  Project: jiniaoxu/NLP-1
def SemankeyWord(content, title, skenum=None):
    content = extract_html(content)
    # Overall logic
    # 1. Text preprocessing (word segmentation and POS tagging, word filtering, recording word-related information)
    wordsStatisticsData, wordsData = textPreprocessing.word_segmentation(
        content, title)
    # 2. Word semantic contribution values (compute word semantic similarity, build the word semantic similarity network, compute intermediary-degree density)
    intermediaryDensity = semanticsCount.intermediaryDegreeDensity(
        content, title)
    # 3. Compute word statistical feature values
    # keywordDatas = statisticsCount.tfidf()
    wordsStatisticsData = statisticsCount.wordsStatistics(wordsStatisticsData)
    # 4. Compute the keyword score of each word
    # Basic algorithm settings
    # Weight of the semantic contribution value
    vdw = 0.6
    # Weight of the statistical feature value
    tw = 0.4
    # Position weights within the statistical features
    locw1, locw2, locw3 = 0.5, 0.3, 0.3
    # Word-length weight within the statistical features
    lenw = 0.01
    # Part-of-speech weight within the statistical features
    posw = 0.5
    # TF-IDF weight within the statistical features
    tfidfw = 0.8

    # Re-iterate over the collected words
    ske = {}
    for key in wordsStatisticsData.keys():
        # Fetch the semantic contribution value (fall back to 0 if the word is missing from the intermediary-degree density map)
        if intermediaryDensity.get(key):
            vdi = intermediaryDensity.get(key)
        else:
            vdi = 0

        # The TF-IDF weight is not applied yet
        score = vdw * vdi + tw * (locw1 * float(wordsStatisticsData[key][0]) +
                                  lenw * int(len(key)) +
                                  posw * float(wordsStatisticsData[key][1]))
        ske[key] = score

    ske = sorted(ske.items(), key=lambda d: d[1], reverse=True)  # sort in descending order
    skelen = len(ske)
    if skenum is None:
        ske = ske[:math.ceil(skelen / 3)]
    else:
        ske = ske[:skenum]
    words = [word for word, _ in ske]
    words = ",".join(words)
    return words
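A minimal usage sketch, assuming math and the project modules (textPreprocessing, semanticsCount, statisticsCount, extract_html) are imported at the top of main.py; the HTML content and title below are made up for illustration:

html = '<p>自然语言处理中的关键词抽取方法研究</p>'  # hypothetical input document
keywords = SemankeyWord(html, '关键词抽取')  # default: keep the top third of ranked candidates
print(keywords)  # comma-separated keyword string
top5 = SemankeyWord(html, '关键词抽取', skenum=5)  # or cap the number of keywords explicitly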
Example #3
def main(fileName, path):
    # Overall logic
    # 1. Text preprocessing (word segmentation and POS tagging, word filtering, recording word-related information)
    print('------ Text preprocessing ------')
    wordsStatisticsData, wordsData = textPreprocessing.word_segmentation(
        fileName, path)
    # 2. Word semantic contribution values (compute word semantic similarity, build the word semantic similarity network, compute intermediary-degree density)
    print('------ Computing word semantic contribution values ------')
    intermediaryDensity = semanticsCount.intermediaryDegreeDensity(
        fileName, path)
    # 3. Compute word statistical feature values
    # keywordDatas = statisticsCount.tfidf()
    print('------ Computing word statistical feature values ------')
    wordsStatisticsData = statisticsCount.wordsStatistics(wordsStatisticsData)
    print('------ Aggregating keyword scores ------')
    # 4. Compute the keyword score of each word
    # Basic algorithm settings
    # Weight of the semantic contribution value
    vdw = 0.6
    # Weight of the statistical feature value
    tw = 0.4
    # Position weights within the statistical features
    locw1, locw2, locw3 = 0.5, 0.3, 0.3
    # Word-length weight within the statistical features
    lenw = 0.01
    # Part-of-speech weight within the statistical features
    posw = 0.5
    # TF-IDF weight within the statistical features
    tfidfw = 0.8

    # Re-iterate over the collected words
    ske = {}
    for key in wordsStatisticsData.keys():
        # Fetch the semantic contribution value (fall back to 0 if the word is missing from the intermediary-degree density map)
        if intermediaryDensity.get(key):
            vdi = intermediaryDensity.get(key)
        else:
            vdi = 0

        # The TF-IDF weight is not applied yet
        score = vdw * vdi + tw * (locw1 * float(wordsStatisticsData[key][0]) +
                                  lenw * int(len(key)) +
                                  posw * float(wordsStatisticsData[key][1]))
        ske[key] = score

    ske = sorted(ske.items(), key=lambda d: d[1], reverse=True)  # sort in descending order
    # print(json.dumps(ske, ensure_ascii=False))
    return ske
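To make the weighting in step 4 concrete: each word's score is vdw * vdi + tw * (locw1 * position_feature + lenw * word_length + posw * pos_feature), with the TF-IDF term (tfidfw) defined but not yet applied. A small worked example with made-up feature values:

vdw, tw, locw1, lenw, posw = 0.6, 0.4, 0.5, 0.01, 0.5
vdi, loc, word_len, pos = 0.8, 1.0, 2, 0.7  # hypothetical feature values for one word
score = vdw * vdi + tw * (locw1 * loc + lenw * word_len + posw * pos)
print(round(score, 3))  # 0.48 + 0.4 * (0.5 + 0.02 + 0.35) = 0.828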