Exemplo n.º 1
0
def addEdge(db, G, id_list, id_):
    """Add graph edges from page ``id_`` to the known pages it links to.

    Reads the comma-separated ``pointing`` column of the ``url_tb`` row whose
    id is ``id_`` and calls ``G.add_edge(id_, target)`` once for every
    distinct target id that is also contained in ``id_list``. Finally prints
    ``id_`` as a progress marker.

    Args:
        db: Database wrapper exposing ``executeSelectSql(sql, params)``.
        G: Graph object exposing ``add_edge(src, dst)``.
        id_list: Container of page ids that are allowed as edge targets.
        id_: Id of the source page.

    Raises:
        ValueError: If a non-blank entry of ``pointing`` is not an integer.
    """
    # The ``with`` block releases the lock even when the query raises, so a
    # failing SELECT can no longer leave every other caller blocked.
    with db_lock:
        pointing_rows = db.executeSelectSql(
            "SELECT pointing FROM url_tb WHERE id=%s", (id_, ))

    if pointing_rows and pointing_rows[0] and pointing_rows[0][0]:
        targets = set()
        for token in pointing_rows[0][0].split(","):
            token = token.strip()
            if not token:
                # Tolerate blank entries (e.g. a trailing comma) instead of
                # crashing in int().
                continue
            target_id = int(token)  # parsed once, reused for test and insert
            if target_id in id_list:
                targets.add(target_id)
        # The set collapses duplicate links so each edge is added once.
        for target_id in targets:
            G.add_edge(id_, target_id)
    print(id_)
Exemplo n.º 2
0
def saveIndexToDb(db, id_, word_dict):
    """Merge one page's word counts into the inverted index table ``dict_tb1``.

    ``word_dict`` is the per-page index, so ``id_`` is the only document id
    involved. Each row of ``dict_tb1`` stores, for one word, a comma-separated
    list of ``"<doc id>/<count>"`` entries in its ``result`` column. For every
    word of the page:

    * if the word has a non-empty ``result`` that already lists ``id_``,
      nothing is written;
    * if it has a non-empty ``result`` without ``id_``, the new entry is
      appended;
    * otherwise a new row is inserted.

    Args:
        db: Database wrapper exposing ``executeSelectSql`` and
            ``executeUpdateSql`` (both taking ``sql, params``).
        id_: Id of the page the counts belong to.
        word_dict: Mapping of word -> value stored after the ``/`` in the
            entry (the occurrence count as built by the caller).
    """
    # Match "<id_>/" only at the start of an entry (beginning of the string
    # or right after a comma). An unanchored search would let id 1 match
    # inside "21/3" and wrongly treat the page as already indexed.
    doc_entry_re = re.compile(r"(?:^|,){0}/".format(re.escape(str(id_))))

    for word, count in word_dict.items():
        new_entry = "{0}/{1}".format(id_, count)
        # Hold the lock across the whole read-modify-write so that a
        # concurrent writer cannot overwrite the entry list between our
        # SELECT and UPDATE; ``with`` also releases it if a query raises.
        with db_lock:
            rows = db.executeSelectSql(
                "SELECT result FROM dict_tb1 WHERE word=%s", (word, ))

            if rows and rows[0] and rows[0][0]:
                # The word already has an entry list.
                existing = rows[0][0]
                if doc_entry_re.search(existing) is not None:
                    # This page is already recorded for the word.
                    continue
                db.executeUpdateSql(
                    "UPDATE dict_tb1 SET result=%s WHERE word=%s",
                    (existing + "," + new_entry, word))
            else:
                # No entry list for this word yet.
                db.executeUpdateSql(
                    "INSERT INTO dict_tb1 (word,result) VALUES(%s,%s)",
                    (word, new_entry))
    print("向数据库插入一个word_dict:" + str(id_))
Exemplo n.º 3
0
def parseOnePageHtml(db, id):
    """Recompute and store the outgoing-link list of one page.

    Loads the ``content`` and ``url`` columns of the ``url_tb`` row with the
    given id, extracts the links with ``parseHrefFromHtml(url, content)``,
    looks up the id of every extracted URL that exists in ``url_tb``, and
    writes those ids, comma-separated, to the row's ``pointing`` column.
    When the page has no content or URL, or none of its links is known,
    ``pointing`` is set to the empty string.

    Args:
        db: Database wrapper exposing ``executeSelectSql`` and
            ``executeUpdateSql`` (both taking ``sql, params``).
        id: Id of the page to process. The name shadows the builtin but is
            kept because it is part of the function's interface.
    """
    print("%s" % (id))
    # Every lock section uses ``with`` so the lock is released even if the
    # database call raises.
    with db_lock:
        res_content = db.executeSelectSql(
            "SELECT content from url_tb WHERE id=%s", (id, ))
        res_url = db.executeSelectSql(
            "SELECT url from url_tb WHERE id=%s", (id, ))

    url_num = []
    has_content = res_content and res_content[0] and res_content[0][0]
    has_url = res_url and res_url[0] and res_url[0][0]
    if has_content and has_url:
        url_list = parseHrefFromHtml(res_url[0][0], res_content[0][0])
        print(url_list)
        for url in url_list:
            # Resolve each extracted link to its page id, if it is known.
            with db_lock:
                url_id_res = db.executeSelectSql(
                    "SELECT id FROM url_tb WHERE url=%s", (url, ))
            if url_id_res and url_id_res[0] and url_id_res[0][0]:
                url_num.append(str(url_id_res[0][0]))

    new_pointing = ','.join(url_num)
    print(new_pointing)
    with db_lock:
        db.executeUpdateSql("UPDATE url_tb SET pointing=%s WHERE id=%s",
                            (new_pointing, id))
Exemplo n.º 4
0
def buildOnePage(db, id_):
    """Build and persist the word index for one page.

    Loads the ``content`` column of the ``url_tb`` row with id ``id_``. When
    it is non-empty, the title (``getTitleFromHtml``) and the tag-stripped
    text (``rm_tags``) are stored via ``saveTitleTextToDb``, the text is
    segmented with ``jieba.cut_for_search`` and the occurrences of each
    segment are counted. The resulting counts (empty when the page has no
    content) are passed to ``saveIndexToDb``.

    Args:
        db: Database wrapper handed through to the helper functions and
            exposing ``executeSelectSql(sql, params)``.
        id_: Id of the page to index.
    """
    print("开始建立第{0}页的分词索引".format(id_))
    word_dict = {}
    # ``with`` releases the lock even if the query raises.
    with db_lock:
        res_content = db.executeSelectSql(
            "SELECT content FROM url_tb WHERE id=%s", (id_, ))

    if res_content and res_content[0] and res_content[0][0]:
        html = res_content[0][0]
        title = getTitleFromHtml(html)
        content = rm_tags(html)
        saveTitleTextToDb(db, content, title, id_)
        for word in jieba.cut_for_search(content):
            # Start at 0 for unseen words. The previous code incremented
            # only when the key was missing, which raised KeyError on the
            # first word of every page.
            word_dict[word] = word_dict.get(word, 0) + 1

    saveIndexToDb(db, id_, word_dict)
    print("单页的分词索引已建立")
Exemplo n.º 5
0
def saveTitleTextToDb(db, content, title, id_):
    """Store a page's text and title in its ``url_tb`` row.

    Args:
        db: Database wrapper exposing ``executeUpdateSql(sql, params)``.
        content: Value written to the ``text`` column.
        title: Value written to the ``title`` column.
        id_: Id of the row to update.
    """
    # Both updates run under one lock acquisition; ``with`` guarantees the
    # lock is released even if either statement raises.
    with db_lock:
        db.executeUpdateSql("UPDATE url_tb SET text=%s WHERE id=%s",
                            (content, id_))
        db.executeUpdateSql("UPDATE url_tb SET title=%s WHERE id=%s",
                            (title, id_))