def load_blogs(self):
    '''
    Read microblog data and dump it as a corpus for training word2vec.

    Rows of the ``microblog`` table are fetched page by page, ordered by
    ``blog_id`` descending. For every row the ``rmc`` column is used when
    it is non-empty, otherwise ``mc``. The text is segmented with
    ``jieba.cut``, tokens found in ``self.stopwords`` are dropped, and
    posts that keep at least five tokens are appended to ``blogs.txt``,
    one post per line with tokens separated by single spaces.

    A running row counter is printed for every row as progress output.
    '''
    self.load_stopwords()
    stopwords = self.stopwords

    filepath = 'blogs.txt'
    dbhelper = DBHelper()
    page_size = 100000
    max_pages = 227  # hard upper bound on the number of pages fetched
    min_tokens = 5   # shorter posts are not written to the corpus
    query = 'SELECT mc,rmc FROM microblog ORDER BY blog_id DESC LIMIT %s,%s'

    row_no = 0
    # Append mode: an existing blogs.txt is extended, not truncated.
    with open(filepath, 'a') as writer:
        for page in range(max_pages):
            # In the "LIMIT offset,count" form the second value is a row
            # count, not an end position. Passing (page + 1) * page_size
            # here made consecutive pages overlap and wrote duplicates.
            blogs = dbhelper.select(query % (page * page_size, page_size))
            if not blogs:
                break
            for blog in blogs:
                print(row_no)
                row_no += 1
                # Prefer rmc (column 1); fall back to mc (column 0).
                msg = blog[1] or blog[0] or ''
                tokens = [token for token in jieba.cut(msg)
                          if token not in stopwords]
                if len(tokens) >= min_tokens:
                    writer.write(' '.join(tokens) + '\n')
def expand_entry(): ''' 扩展词条的背景材料 ''' dbhelper = DBHelper() searcher = Searcher() entrys = dbhelper.select("SELECT entryid,name,category FROM entry WHERE background IS NULL") for entry in entrys: print entry[1], entry[2] searcher.expand(entry[0], entry[1], entry[2])