# Processing step for the current postID: load the post body, strip the
# "mw-highlight" <div> blocks, record which wiki entry titles occur in the
# remaining text, and word-segment that text with jieba.
#
# NOTE(review): the original statements were collapsed onto a single line, so
# the nesting below is reconstructed from the syntax. Confirm it against the
# intended layout, in particular whether the jieba segmentation belongs inside
# the final `if postID in postIDToPostContentDict:` block.

# Not referenced again within this fragment; kept because later code may use it.
postTitle = postDict[postID]

# NOTE(review): the query is built by string concatenation. If postID can ever
# come from outside the database, switch to the driver's parameterized form.
cursor.execute(
    "select post_content from wp_posts where ID=" + str(postID) + ";"
)
data = cursor.fetchone()
# NOTE(review): presumably a DB-API cursor, in which case `data` is None when
# no row matches and the subscript below raises TypeError -- confirm that every
# postID is guaranteed to exist in wp_posts.
postContent = data[0]

# Raw string so the \S / \s escapes reach the regex engine untouched.
# The lazy `[\S\s]*?` stops at the first "</div>" after the opening tag.
mode = r'<div class="mw-highlight mw-content-ltr" dir="ltr">([\S\s]*?)</div>'
postContent = re.sub(mode, "", postContent)

if postContent == "":
    # Nothing left once the highlighted blocks are removed: drop the post and
    # count the removal.
    # NOTE(review): if the caller is iterating over postDict itself, deleting
    # here changes the dict during iteration -- verify against the caller.
    del postDict[postID]
    delete += 1
else:
    # Reduce the remaining HTML to its text content.
    postContent = PyQuery(postContent).text()
    postIDToPostContentDict[postID] = postContent

    # Build both directions of the post <-> wiki-entry mapping for every entry
    # whose title appears as a substring of the post text.
    for wikiEntryID, wikiEntryTitle in wikiEntryDict.iteritems():
        if wikiEntryTitle in postContent:
            count += 1
            postIDToWikiEntryIDDict.setdefault(postID, []).append(wikiEntryID)
            wikiEntryIDToPostIDDict.setdefault(wikiEntryID, []).append(postID)

if postID in postIDToPostContentDict:
    # Track the largest number of wiki entries matched by any single post.
    if postID in postIDToWikiEntryIDDict:
        entryCount = len(postIDToWikiEntryIDDict[postID])
        if entryCount > maxEntry:
            maxEntry = entryCount

    # cut_all=False is passed explicitly, as in the original call; the
    # generator is materialized into a list of segments.
    seg_list_generator = jieba.cut(postContent, cut_all=False)
    seg_list = list(seg_list_generator)