Example #1
import itertools
import logging

from gensim import corpora
from gensim.similarities import Similarity


def getSimilarity(df_content_o):
    """Build a gensim Similarity index from the 'content' column of a DataFrame."""
    logging.debug('preparing docSim')
    raw_documents = list(df_content_o['content'])
    # Each document is pre-tokenized text joined by single spaces.
    corpora_documents = []
    for item_text in raw_documents:
        item_str = item_text.split(' ')
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # num_features must cover every token id, hence the vocabulary size (+1 for safety).
    nf = len(set(itertools.chain.from_iterable(corpora_documents))) + 1
    similarity = Similarity('-Similarity-index', corpus, num_features=nf)
    similarity.num_best = max_similar_num  # max_similar_num: a module-level setting defined elsewhere
    return similarity, dictionary
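A minimal usage sketch for the function above; the tiny DataFrame and the max_similar_num value are illustrative assumptions, not part of the original:

# Hypothetical usage; the sample DataFrame and max_similar_num are assumptions.
import pandas as pd

max_similar_num = 3
df = pd.DataFrame({'content': ['贷款 期限 多久', '如何 申请 贷款', '有 哪些 贷款 品种']})
similarity, dictionary = getSimilarity(df)
query_bow = dictionary.doc2bow('贷款 期限'.split(' '))
print(similarity[query_bow])  # top matches as (doc_index, cosine_similarity) pairs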
Example #2
import pickle

from gensim import corpora
from gensim.similarities import Similarity


def get_docsim_feature(contents, remarks=""):
    """Tokenize contents, build a Similarity index, and cache the dictionary and corpus."""
    # Config and Tokenizer are project helpers defined elsewhere.
    dictionary_path = Config.cache_dir + "/docsim/dic_%s.pkl" % remarks
    corpus_path = Config.cache_dir + "/docsim/corpus_%s.pkl" % remarks
    corpora_documents = []
    tokenizer = Tokenizer()
    for item_text in contents:
        item_str = tokenizer(item_text)
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # num_features must be at least the vocabulary size, so derive it from the
    # dictionary rather than hard-coding a value that may truncate larger vocabularies.
    similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
    similarity.num_best = 3
    pickle.dump(dictionary, open(dictionary_path, "wb"), protocol=4)
    pickle.dump(corpus, open(corpus_path, "wb"), protocol=4)

    return similarity, corpus
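A companion sketch for reloading the cached artifacts in a later run; the remarks value "train" is an illustrative assumption:

# Hypothetical reload; assumes get_docsim_feature(contents, remarks="train") ran earlier.
import pickle

dictionary = pickle.load(open(Config.cache_dir + "/docsim/dic_train.pkl", "rb"))
corpus = pickle.load(open(Config.cache_dir + "/docsim/corpus_train.pkl", "rb"))
query_bow = dictionary.doc2bow(Tokenizer()("query text"))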
Example #3
import os

from gensim import corpora
from gensim.corpora import Dictionary
from gensim.similarities import Similarity


def doc2bow(question):
    """Match a user question against the FAQ entries in questions.txt."""
    with open("questions.txt", encoding='utf8') as f:
        lines = f.readlines()
    print(lines)
    corpora_documents = []
    text2 = []
    for line in lines:
        line_strip = line.strip('\n')
        text2.append(line_strip)
        # del_stopword (defined elsewhere) tokenizes the line and drops stopwords.
        text1 = del_stopword(line_strip)
        corpora_documents.append(text1)
    print(corpora_documents)

    # Build the dictionary, e.g. Dictionary(183 unique tokens: ['\n', '品种', '贷款', '贷款期限', '申请']...)
    dictionary = corpora.Dictionary(corpora_documents)
    # Reuse a previously saved dictionary if it exists; otherwise save the fresh one.
    if os.path.exists('dict.txt'):
        dictionary = Dictionary.load('dict.txt')
    else:
        dictionary.save('dict.txt')
        dictionary = Dictionary.load('dict.txt')
    print(dictionary)

    # Build the bag-of-words corpus: [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1)], [(0, 1), (2, 1), (4, 1), (5, 1)]...]
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    print(corpus)
    # Serialize the corpus so it can be reused later.
    if os.path.exists("corpuse.mm"):
        corpus = corpora.MmCorpus('corpuse.mm')
    else:
        corpora.MmCorpus.serialize('corpuse.mm', corpus)
        corpus = corpora.MmCorpus('corpuse.mm')

    # Build the similarity index. num_features is the dimensionality, i.e. an
    # upper bound on the number of distinct words in the dictionary.
    similarity = Similarity('-Similarity-index', corpus, num_features=400)
    test_cut_raw_1 = del_stopword(question)
    print(test_cut_raw_1)
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 5
    # The most similar documents, as (index_of_document, similarity) tuples.
    print(similarity[test_corpus_1])
    for sample in similarity[test_corpus_1]:
        index = sample[0]
        # A similarity of 1 is an exact match, so return the answer directly.
        if sample[1] == 1:
            print("Your question is: " + str(text2[int(index)]) + " similarity: " + str(sample[1]))
            break
        # Above the threshold, this is probably what the user meant to ask.
        elif sample[1] >= 0.8:
            print("Did you mean to ask: " + str(text2[int(index)]) + " similarity: " + str(sample[1]))
        # Otherwise offer a list of similar questions for the user to choose from.
        else:
            print("Similar question: " + str(text2[int(index)]) + " similarity: " + str(sample[1]))
    print('################################')
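A one-line usage sketch for the function above; the original pulled the query from an undefined global, so here the question string (an illustrative assumption) is passed in directly:

# Hypothetical call with a user question.
doc2bow('贷款期限最长是多久')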
Example #4
# mySQLUrl, userName, passwd and generateCorpus are defined elsewhere in this project.
import MySQLdb
from flask import Flask, g
from flask_cors import CORS
from gensim import models
from gensim.similarities import Similarity

DBName = "bullhorn"

db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)

app = Flask(__name__)
CORS(app)

resultTuple = generateCorpus()
dictionary = resultTuple['dictionary']
corpus = resultTuple['corpus']
socTitleDict = resultTuple['socTitleDict']

# Build the LSI model once at startup and index the LSI-projected corpus;
# num_features equals num_topics because the indexed vectors live in topic space.
num_topics = 200
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
gensimIndex.num_best = 3


@app.before_request
def before_request():
    # Open a per-request database connection and expose it, together with the
    # shared module-level index, through flask.g.
    g.db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)
    g.gensimIndex = gensimIndex
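A hypothetical request handler showing how the objects stored on flask.g might be queried; the /match route, its query parameter, and the whitespace tokenization are assumptions, not part of the original:

from flask import jsonify, request

@app.route('/match')
def match():
    # Everything here beyond g.gensimIndex, lsi, dictionary and socTitleDict is assumed.
    title = request.args.get('title', '')
    vec_lsi = lsi[dictionary.doc2bow(title.lower().split())]
    sims = g.gensimIndex[vec_lsi]  # top-3 (doc_index, score) pairs
    return jsonify([{'socCode': socTitleDict[i], 'score': float(s)} for i, s in sims])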
Example #5
    '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
]
corpora_documents = []
for item_text in raw_documents:
    # Tokenize with jieba (the original also showed a commented-out
    # project-specific tokenizer, util_words_cut).
    item_str = list(jieba.cut(item_text))
    corpora_documents.append(item_str)

# Build the dictionary and bag-of-words corpus.
dictionary = corpora.Dictionary(corpora_documents)
corpus = [dictionary.doc2bow(text) for text in corpora_documents]

similarity = Similarity('-Similarity-index', corpus, num_features=400)

test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚'
test_cut_raw_1 = list(jieba.cut(test_data_1))
test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
similarity.num_best = 5
# The most similar documents, as (index_of_document, similarity) tuples.
print(similarity[test_corpus_1])

print('################################')

test_data_2 = '家人因涉嫌运输毒品被抓,她只是去朋友家探望朋友的,结果就被抓了,还在朋友家收出毒品,可家人的身上和行李中都没有。现在已经拘留10多天了,请问会被判刑吗'
test_cut_raw_2 = list(jieba.cut(test_data_2))
test_corpus_2 = dictionary.doc2bow(test_cut_raw_2)
similarity.num_best = 5
# The most similar documents, as (index_of_document, similarity) tuples.
print(similarity[test_corpus_2])
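Since gensim's Similarity class supports save() and load(), the index built above could also be persisted between runs; the file name here is an illustrative assumption:

# Persist the shard-backed index and reload it; 'legal-qa.index' is assumed.
similarity.save('legal-qa.index')
similarity = Similarity.load('legal-qa.index')
print(similarity[test_corpus_1])  # same top-5 results as before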
Example #6
# This fragment assumes cursor, corpus, dictionary and socTitleDict from
# surrounding code (cf. Example #4).
row = cursor.fetchone()
# Build the LSI model and index once, outside the row loop.
num_topics = 200
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
gensimIndex.num_best = 3
while row is not None:
    print(type(row[2]))
    doc = row[2]
    doclist = doc.lower().split()
    vec_bow = dictionary.doc2bow(doclist)
    vec_lsi = lsi[vec_bow]
    # Generate features: the document's words, ordered by the LSI component
    # values they are zipped with.
    wordsId = [word[0] for word in vec_bow]
    wordsIdMap = zip(wordsId, vec_lsi)
    features = [dictionary.get(x[0])
                for x in sorted(wordsIdMap, key=lambda x: -x[1][1])]
    sims = gensimIndex[vec_lsi]

    for item in sims:
        socCode = socTitleDict[item[0]]
        score = item[1]
        bullhornCode = row[0]
        featuresList = " ".join(features[0:10]).replace('\'', '')
        sortedVecList = sorted(vec_lsi, key=lambda x: -x[1])
        top10feature = [str(round(vec[1], 5)) for vec in sortedVecList[0:10]]
        featureScoreStr = " ".join(top10feature)
        # Write to the database. Plain %-interpolation is kept from the original,
        # but a parameterized cursor.execute() would be safer against injection.
        sql = '''
        INSERT INTO jobtitlematch (bullhorn_job_id, jobtitle_id, score, featurelist, feature_score)
        VALUES ('%d', '%d', '%f','%s', '%s');
        ''' % (bullhornCode, socCode, score, featuresList, featureScoreStr)