def fetch_segwords(tablename):
    mysql = msc.MyPymysqlPool("dbMysql")
    sql = "SELECT t1.keyword_segmented,t2.seqno \
            FROM pzbase.ai_keywords_classification_train t1 \
            inner join pzbase.ai_keywords_classification_classdef t2 \
                    on t1.class_level1=t2.class_level1 and t1.class_level2=t2.class_level2 and t1.class_level3=t2.class_level3 and t2.search_word_flag=1 \
            where t1.proc_flag=1 and t1.keyword not in (select keyword from pzbase.ai_keywords_classification_test where predict_method='bayes')"

    rst1 = mysql.getAll(sql)

    # sql = "SELECT t1.keyword_segmented,t2.seqno \
    #         FROM pzbase.ai_keywords_classification_train t1 \
    #         inner join pzbase.ai_keywords_classification_classdef t2 \
    #                 on t1.class_level1=t2.class_level1 and t1.class_level2=t2.class_level2 and t1.class_level3=t2.class_level3 and t2.search_word_flag=1 \
    #         where t1.proc_flag=1 and t1.keyword in (select keyword from pzbase.ai_keywords_classification_test where predict_method='')"
    # rst2 = mysql.getAll(sql)

    sql = "SELECT keyword_segmented,seqno FROM pzbase.ai_keywords_classification_test where predict_method='bayes'"
    rst2 = mysql.getAll(sql)
    mysql.dispose()

    loginfo = ' %d train and %d test segmented keywords with class IDs have been fetched.' % (
        len(rst1), len(rst2))
    gl.write_log(logpath, 'info', loginfo)
    return rst1, rst2
Example #2
def load_wordVectors(model_filepath):
    loginfo = 'loading question word2vec model...'
    gl.write_log(logpath, 'info', loginfo)

    model = word2vec.Word2Vec.load(model_filepath)
    vocab = list(model.wv.vocab.keys())  # all words in the vocabulary (gensim < 4.0 API)
    return model, vocab
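
A minimal usage sketch for the loader above (gensim < 4.0 API, matching model.wv.vocab); the model path and query word are illustrative only:

model, vocab = load_wordVectors('question.model')  # hypothetical path
query = '汽车'
if query in vocab:  # guard against a KeyError for out-of-vocabulary words
    print(model.wv.most_similar(query, topn=5))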
Example #3
def index_search(query_words, title_number):
    if not title_number:
        title_number = get_title_number

    # words_results: [<Top 5 Results for Term('segwords', '美团') runtime=...>, <Top 5 Results for Term('segwords', '汽车') runtime=...>]
    words_results = tml.query_index(query_words, int(title_number),
                                    index_searcher, query_parser, logpath)

    i = 0
    j = 0
    words_results_dict = {}
    for word_results in words_results:  # get results for each segmented query word
        # print('word_results: ', word_results)
        word_result_dict = {}
        for word_result in word_results:
            word_result_id = {}
            word_result_id['source'] = word_result.get(key="source")
            word_result_id['ad_title'] = word_result.get(key="title")
            word_result_id['segmented_words'] = word_result.get(key="segwords")
            word_result_dict[word_result.get(key="seq_no")] = word_result_id
            j += 1
        words_results_dict[query_words[i]] = word_result_dict
        i += 1

    loginfo = ' query words: %s, result number: %d.' % (query_words, j)
    gl.write_log(logpath, 'info', loginfo)
    return words_results_dict
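
The returned structure maps each query word to a dict keyed by seq_no; a sketch of its shape, with illustrative values only:

# words_results_dict = {
#     '美团': {97: {'source': '...', 'ad_title': '...', 'segmented_words': '...'},
#              98: {...}},
#     '汽车': {...},
# }
words_results_dict = index_search(['美团', '汽车'], 5)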
Example #4
def load_dicts(path, logpath):
    jieba.load_userdict(path)
    # Dynamically tune word frequencies so that new words from the user dict
    # rank higher and are matched first during segmentation.
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            jieba.suggest_freq(line.strip(), tune=True)

    loginfo = ' User dict %s has been loaded.' % path
    gl.write_log(logpath, 'info', loginfo)
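
A quick sketch of what suggest_freq(tune=True) buys: a multi-word term from the user dict that jieba might otherwise split is kept whole after tuning ('BOSS直聘' is illustrative only):

import jieba

print(jieba.lcut('BOSS直聘招聘'))          # may split: ['BOSS', '直聘', '招聘']
jieba.suggest_freq('BOSS直聘', tune=True)  # raise its frequency so it wins
print(jieba.lcut('BOSS直聘招聘'))          # now kept whole: ['BOSS直聘', '招聘']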
def get_similarity_words(model, query_word, number):
    similarity_words = ''
    try:
        similarity_words = model.most_similar(query_word, topn=number)
    except KeyError:
        loginfo = ' The word %s is not in the vocabulary!' % query_word
        gl.write_log(logpath, 'error', loginfo)
    return similarity_words
def fetch_segwords(tablename):
    mysql = msc.MyPymysqlPool("dbMysql")
    sql = "SELECT ad_title_segwords FROM %s where ad_title_segwords is not null" % tablename
    rst = mysql.getAll(sql)
    mysql.dispose()

    loginfo = ' %d rows are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def fetch_segwords(tablename):
    mysql = msc.MyPymysqlPool("dbMysql")
    sql = ''.join(['SELECT question_stem_segment FROM ', tablename, " where is_segmented=1", ])
    rst = mysql.getAll(sql)
    mysql.dispose()

    loginfo = ' %d rows have been fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def incrementally_build_model(original_modelpath, new_model_filepath, sentences):
    model = word2vec.Word2Vec.load(original_modelpath)
    model.build_vocab(sentences, update=True)
    print(model.corpus_count, model.iter)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.save(new_model_filepath)

    loginfo = ' word2vec model %s has been built incrementally based on %s!' % (new_model_filepath, original_modelpath)
    gl.write_log(logpath, 'info', loginfo)
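
model.iter is a gensim pre-4.0 attribute; under gensim >= 4.0 it was renamed to epochs, so the same incremental update would look roughly like this sketch:

from gensim.models import Word2Vec

def incrementally_build_model_v4(original_modelpath, new_model_filepath, sentences):
    # gensim >= 4.0: `iter` became `epochs`; the rest of the flow is unchanged
    model = Word2Vec.load(original_modelpath)
    model.build_vocab(sentences, update=True)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(new_model_filepath)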
Example #9
def calculate_similarity(model, word1_list, word2_list):
    similarity = -1
    try:
        similarity = model.n_similarity(word1_list, word2_list)
    except KeyError:
        loginfo = ' The similarity is not available: a word is out of vocabulary!'
        gl.write_log(logpath, 'error', loginfo)

    return similarity
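
n_similarity compares the mean vectors of two token lists, so both lists must be fully in-vocabulary or the KeyError branch above fires; a usage sketch assuming a loaded model, with illustrative tokens and threshold:

sim = calculate_similarity(model, ['汽车', '保养'], ['汽车', '维修'])
if sim >= 0.8:  # illustrative threshold
    print('similar enough: %.3f' % sim)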
Example #10
def fetch_dest_segwords(tablename, field_value):
    mysql = msc.MyPymysqlPool("dbMysql")
    sql = ''.join(['SELECT question_seqno, question_stem, question_stem_segment_clear FROM ', \
                   tablename, " where is_segmented=1 and in_tablename='", field_value, "'", ])
    rst = mysql.getAll(sql)
    mysql.dispose()

    loginfo = ' %d matched questions have been fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
Example #11
def get_stopwords(path, logpath):
    stopwords = []
    with open(path, "r", encoding='utf8') as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())

    loginfo = ' Stop words dict %s has been loaded.' % path
    gl.write_log(logpath, 'info', loginfo)
    return stopwords
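
Stop-word filtering does one membership test per token, so a set fits better than a list; a minimal variant of the loader above:

def get_stopwords_set(path):
    # Same file format as above, but a set gives O(1) `word in stopwords` checks
    with open(path, 'r', encoding='utf8') as f:
        return {line.strip() for line in f if line.strip()}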
def get_records(tablename):
    mysql = msc.MyPymysqlPool("dbMysql")
    sql = ''.join(['SELECT question_seqno, upper(question_stem) as question_stem FROM ', \
                   tablename, " where is_segmented=0", ])
    rst = mysql.getAll(sql)
    mysql.dispose()

    loginfo = ' %d rows are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def incrementally_build_model(model_path, sentences):
    model = word2vec.Word2Vec.load(model_path)
    model.build_vocab(sentences, update=True)
    print(model.corpus_count, model.iter)
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=model.iter)
    model.save(model_path)

    loginfo = ' model %s has been built incrementally!' % model_path
    gl.write_log(logpath, 'info', loginfo)
def get_titles(tablename):
    mysql = msc.MyPymysqlPool("dbMysql")
    # uppercase for case-insensitive matching: Boss直聘 = BOSS直聘 = boss直聘
    sql = "SELECT seq_no, upper(ad_title) as ad_title FROM %s where proc_flag is null or proc_flag<>1" % tablename
    rst = mysql.getAll(
        sql
    )  # [{'seq_no': 97, 'ad_title': '小说看到一半要花钱?这里让你免费看完大结局!'}, {'seq_no': 98, 'ad_title': '玄幻大神天蚕土豆亲授逆袭攻略'}]
    mysql.dispose()

    loginfo = ' %d titles are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
Example #15
def get_titles(tablename):
    mysql = msc.MyPymysqlPool("dbMysql")
    # uppercase for case-insensitive matching: Boss = BOSS = boss
    # sql = "SELECT seqno, upper(keyword) as keyword FROM %s where proc_flag=0 order by seqno" % tablename
    sql = "SELECT seqno, upper(keyword) as keyword FROM %s order by seqno" % tablename
    rst = mysql.getAll(
        sql
    )  # [{'seqno': 97, 'keyword': '小说看到一半要花钱?这里让你免费看完大结局!'}, {'seqno': 98, 'keyword': '玄幻大神天蚕土豆亲授逆袭攻略'}]
    mysql.dispose()

    loginfo = ' %d titles are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def create_index(index_dir):
    loginfo = 'inverted index is being created...'
    gl.write_log(logpath, 'info', loginfo)

    index_dir_writen, index_dir_processing = check_index_directory(index_dir)

    # rebuild the inverted index over all data, since there are not too many titles
    write_index_file(index_dir_writen, tablename)

    if index_dir_writen == index_dir_processing:
        shutil.rmtree(index_dir)
        os.rename(index_dir_writen, index_dir)
        loginfo = 'Inverted index directory %s has been renamed.' % index_dir
        gl.write_log(logpath, 'info', loginfo)
def build_model(model_path,
                sentences,
                embedding_size=128,
                in_window=5,
                in_min_count=5):
    w2vModel = word2vec.Word2Vec(sentences,
                                 sg=1,
                                 size=embedding_size,
                                 window=in_window,
                                 min_count=in_min_count,
                                 workers=multiprocessing.cpu_count())
    w2vModel.save(model_path)
    loginfo = ' model %s has been built initially!' % model_path
    gl.write_log(logpath, 'info', loginfo)
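
A toy invocation of build_model (gensim < 4.0, matching size= above); gensim expects sentences as lists of tokens, and the path and sample data here are illustrative only:

sentences = [['小说', '免费', '大结局'],
             ['玄幻', '大神', '攻略'],
             ['小说', '玄幻', '免费']]
build_model('w2v.model', sentences, embedding_size=32, in_window=3, in_min_count=1)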
def segment_title(titles, tablename):
    i = 0
    for title in titles:
        seq_no = title["seq_no"]
        ad_title = title["ad_title"]
        segmented_words = tml.words_segment(ad_title, stopwords, GOOD_WORDS)
        words = re.sub(CUT_WORDS, '', str(segmented_words))
        write_segmented_words(tablename, seq_no, words)
        if i % 1000 == 0:
            loginfo = ' progress status: %d ' % i
            gl.write_log(logpath, 'info', loginfo)
        i += 1
    loginfo = ' Total %d titles\' segmented words have been written.' % i
    gl.write_log(logpath, 'info', loginfo)
def segment_questions(records, tablename):
    i = 0
    for record in records:
        seqno = record["question_seqno"]
        stemwords = record["question_stem"]
        segmented_words = tml.words_segment(stemwords, stopwords, GOOD_WORDS, HMM=True)  # HMM gives better results in practice
        segwords = re.sub(CUT_WORDS, '', str(segmented_words))
        segwords = tml.iterate_replacements(segwords, '\\', '')
        segwords = tml.iterate_replacements(segwords, '  ', ' ')   # iteratively collapse multiple spaces in segwords into one
        write_segmented_words(tablename, seqno, stemwords, segwords)
        if i % 500 == 0:
            loginfo = ' progress status: %d ' % i
            gl.write_log(logpath, 'info', loginfo)
        i += 1
    loginfo = ' Total %d keywords\' segmented words have been written.' % i
    gl.write_log(logpath, 'info', loginfo)
def check_index_directory(index_dir):
    # Index creation can take a while. To keep the existing index readable while
    # a new one is being built, write into a directory with the "_processing"
    # suffix first, then swap it in once the new index is complete.
    index_dir_processing = index_dir + "_processing"
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
        index_dir_writen = index_dir
        loginfo = '     Inverted index directory %s has been created.' % index_dir
        gl.write_log(logpath, 'info', loginfo)
    else:
        if os.path.exists(index_dir_processing):
            shutil.rmtree(index_dir_processing)
        os.mkdir(index_dir_processing)
        loginfo = '     Temporary inverted index directory %s has been created.' % index_dir_processing
        gl.write_log(logpath, 'info', loginfo)
        index_dir_writen = index_dir_processing

    return index_dir_writen, index_dir_processing
Example #21
def segment_title(titles, tablename):
    i = 0
    for title in titles:
        seqno = title["seqno"]
        keyword = title["keyword"]
        segmented_words = tml.words_segment(keyword,
                                            stopwords,
                                            GOOD_WORDS,
                                            iscutall=False)
        # segmented_words = tml.words_segment(keyword, stopwords, GOOD_WORDS, iscutall=True)
        words = re.sub(CUT_WORDS, '', str(segmented_words))
        write_segmented_words(tablename, seqno, words)
        if i % 1000 == 0:
            loginfo = ' progress status: %d ' % i
            gl.write_log(logpath, 'info', loginfo)
        i += 1
    loginfo = ' Total %d keywords\' segmented words have been written.' % i
    gl.write_log(logpath, 'info', loginfo)
Example #22
def insert_question_similarity(model, vocab, dest_records, base_records,
                               threshold, tablename):
    mysql = msc.MyPymysqlPool("dbMysql")

    # Compute the pairwise similarity between dest and base records; pairs above
    # the threshold are inserted into the question-similarity table.
    i = 0
    j = 0
    for dest_record in dest_records:
        # mysql.begin()  # begin a transaction
        seqno1 = dest_record[0]
        stem1 = dest_record[1]
        segwords1 = dest_record[2]
        segwords1_list = segwords1.split()
        max_similarity = -1
        max_seqno = -1
        for base_record in base_records:
            seqno2 = int(base_record[0])
            stem2 = base_record[1]
            segwords2 = base_record[2]
            segwords2_list = segwords2.split()

            if len(segwords1_list) > 0 and len(segwords2_list) > 0:
                similarity = calculate_similarity(model, segwords1_list,
                                                  segwords2_list)
                if similarity > max_similarity:
                    max_similarity = similarity
                    max_seqno = seqno2
                if (similarity >= threshold):
                    sql = ''.join(['insert into ', tablename,
                                   "(question_seqno1,question_seqno2,question_stem1,question_stem2," \
                                   "question_stem_segment1,question_stem_segment2,similarity,load_time)" \
                                   " values(", str(seqno1), ",", str(seqno2), ",'", str(stem1), "','", str(stem2),
                                   "','", str(segwords1), "','", str(segwords2), "',", str(similarity), \
                                   ", CURRENT_TIMESTAMP());"])
                    mysql.insert(sql)
                    i += 1
        j += 1
        print('seqno1, max_seqno, max_similarity: ', seqno1, max_seqno,
              max_similarity)
        mysql.end()  # commit the transaction
    mysql.dispose()
    loginfo = '%d similar rows have been inserted into %s!' % (i, tablename)
    gl.write_log(logpath, 'info', loginfo)
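
The string-built INSERT above breaks if a stem contains a quote; parameter binding is safer. A sketch using raw pymysql (MyPymysqlPool's internals are not shown in this listing, so the connection details and table name are placeholders, and the values reuse the loop variables above):

import pymysql

conn = pymysql.connect(host='localhost', user='user', password='...', db='pzbase')
with conn.cursor() as cur:
    cur.execute(
        "insert into question_similarity"  # placeholder table name
        "(question_seqno1,question_seqno2,question_stem1,question_stem2,"
        "question_stem_segment1,question_stem_segment2,similarity,load_time)"
        " values(%s,%s,%s,%s,%s,%s,%s,CURRENT_TIMESTAMP())",
        (seqno1, seqno2, stem1, stem2, segwords1, segwords2, float(similarity)))
conn.commit()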
def write_index_file(index_dir, tablename):
    analyzer = ChineseAnalyzer(minsize=1)  # minsize=1 so single-character tokens are indexed
    schema = Schema(seq_no=NUMERIC(stored=True),
                    source=TEXT(stored=True),
                    title=TEXT(stored=True),
                    segwords=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(index_dir, schema)
    writer = ix.writer()

    datasets = fetch_segwords(tablename)
    for dataset in datasets:
        get_seq_no = int(dataset["seq_no"])
        get_source = dataset["ad_title_source"]
        get_title = dataset["ad_title"].replace('\n', '')
        get_segwords = dataset["ad_title_segwords"].replace('\n', '')
        writer.add_document(seq_no=get_seq_no,
                            source=get_source,
                            title=get_title,
                            segwords=get_segwords)
    writer.commit()
    loginfo = 'Inverted index for %s has been created.' % tablename
    gl.write_log(logpath, 'info', loginfo)
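
The read side of this index appears near the end of this listing (open_dir plus QueryParser); a compact self-contained sketch of querying the segwords field, with the directory name as an assumption:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir('index_dir')  # whatever directory create_index produced
with ix.searcher() as searcher:
    query = QueryParser('segwords', schema=ix.schema).parse('美团')
    for hit in searcher.search(query, limit=5):
        print(hit['seq_no'], hit['title'])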
                      help="the train/test table name",
                      default='pzbase.ai_keywords_classification_test')

    args = args.parse_args()
    args_dict = args.__dict__
    return args_dict


if __name__ == '__main__':
    global logpath

    args_dict = comand_line_set()
    tablename = args_dict.get("tablename")
    logpath = args_dict.get("logpath")

    gl.write_log(logpath, 'info', '\n\n')
    loginfo = 'segwords vectorization starting...'
    gl.write_log(logpath, 'info', loginfo)

    # get segmented keywords
    rst1, rst2 = fetch_segwords(tablename)
    word2vec_vectorizer(rst1, rst2)
    exit()  # everything below is skipped; the earlier Bayes pipeline is kept for reference

    # tfidf_vec_trainX, tfidf_vec_testX, trainy, testy = tfidf_vectorizer1(rst1, 0.1)
    # vec_trainX, vec_testX, trainy, seqno_test = tfidf_vectorizer2(rst1, rst2)
    vec_trainX, vec_testX, trainy, seqno_test = tf_vectorizer(rst1, rst2)
    model = nativebayes_model_train(vec_trainX, trainy)

    # joblib.dump((tfidf_vec_trainX, tfidf_vec_testX, trainy, seqno_test), 'vec_data.pkl'.format(), compress=3)
    # joblib.dump(model, 'vec_model.pkl'.format(), compress=3)
Example #25
if __name__ == '__main__':
    # NOTE: this snippet was truncated; the __main__ guard above is a minimal
    # reconstruction so that the indented body below parses.
    global get_title_number
    global get_similar_number
    global index_searcher
    global query_parser

    args_dict = comand_line_set()
    index_path = args_dict.get("indexpath")
    logpath = args_dict.get("logpath")
    user_dict_path = args_dict.get("userdictpath")
    comp_dict_path = args_dict.get("compdictpath")
    stop_word_path = args_dict.get("stopwordpath")
    modelpath = args_dict.get("modelpath")
    get_title_number = args_dict.get("titlenumber")
    get_similar_number = args_dict.get("similarnumber")

    gl.write_log(logpath, 'info', "\n\n")
    loginfo = ' word retrieval service starting...'
    gl.write_log(logpath, 'info', loginfo)

    # preload dicts to save running time
    tml.load_dicts(user_dict_path, logpath)
    tml.load_dicts(comp_dict_path, logpath)
    stopwords = tml.get_stopwords(stop_word_path, logpath)

    ix = open_dir(index_path)  # for read only
    index_searcher = ix.searcher()
    query_parser = QueryParser("segwords", schema=ix.schema)
    loginfo = ' inverted index file %s has been opened.' % index_path
    gl.write_log(logpath, 'info', loginfo)

    # preload similar model to save running time