def parse_all(fnames, reparse=False):
    """
    Parse secwiki HTML pages into per-page TSV files and SQLite rows.

    Every parsed record is formatted as ts, tag, url, title, root_domain,
    domain, url_path. For each source file ``secwiki_<digits>.html`` the
    records are written tab-separated (CRLF-terminated) to
    ``data/txt/secwiki_<digits>.txt`` and bulk-inserted into the ``secwiki``
    table of ``data/secwiki.db``.

    :param fnames: iterable of HTML file paths to parse; replaced by every
        file matching ``data/html/secwiki_*.html`` when ``reparse`` is true
    :param reparse: whether to re-parse everything; when true the ``secwiki``
        table is emptied first and already-generated txt files are rebuilt
    :return: None
    """
    sqldb = SQLite('data/secwiki.db')

    # Full re-parse: collect every downloaded page and wipe the table.
    if reparse:
        fnames = list(glob.iglob(r'data/html/secwiki_*.html'))
        sqldb.execute('delete from `secwiki`')

    if not fnames:
        print('No new secwiki')
        return

    insert_sql = 'insert into `secwiki` (`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`url_path`) values(?,?,?,?,?,?,?);'

    # Make sure the output directory exists once, before the loop.
    txt_dir = path("data/txt")
    os.makedirs(txt_dir, exist_ok=True)

    for fname in fnames:
        m = re.search(r'secwiki_(\d+)\.html', fname)
        if m is None:
            # Not a secwiki page file name; nothing to derive a target from.
            continue
        rname = path('data/txt', 'secwiki_' + m.group(1) + '.txt')

        # Skip pages whose target file was already produced -- but never on
        # a full re-parse: the table has just been emptied, so skipping
        # would leave those pages missing from the database.
        if not reparse and os.path.exists(rname) and os.path.getsize(rname) > 0:
            continue

        # Keyed by ts + url so that duplicate entries within one page are
        # inserted into the database only once (the last occurrence wins).
        all_content = {}

        # Both handles are context-managed so they are closed even when
        # parsing or writing raises.
        with codecs.open(rname, mode='wb') as rf, codecs.open(fname, 'rb') as f:
            for content in parse_single(f):
                if not content:
                    continue
                # Write each parsed record to the target file as it arrives.
                all_content[content[0] + content[2]] = content
                line = "\t".join(content)
                rf.write(line.encode() + b'\r\n')

        # Bulk-insert this page's records into sqlite3.
        if all_content:
            sqldb.executemany(insert_sql, all_content.values())