def syntax_filtration(self, source_path, target_path, fail_path, n_threads):
    """Run syntax filtration over the source database in fixed-size batches.

    The ``Content`` column of the database at ``source_path`` is read in
    batches of 1000 rows. Each batch is handed to a ``SyntaxFiltration``
    processor together with ``target_path``, ``fail_path`` and
    ``n_threads``, and the processor is executed before the next batch is
    fetched.

    Args:
        source_path: Path of the database the contents are read from.
        target_path: Path of a database that is initialised with
            ``init_db()`` and passed on to the processor.
        fail_path: Path of a database that is initialised with
            ``init_db()`` and passed on to the processor.
        n_threads: Forwarded unchanged to ``SyntaxFiltration``.
    """
    batch_size = 1000
    # Every handle opened here is recorded so it can be released even when a
    # query or a processor run raises part-way through.
    opened = []
    try:
        source_db_op = DBOperation(source_path)
        opened.append(source_db_op)

        target_db_op = DBOperation(target_path)
        opened.append(target_db_op)
        target_db_op.init_db()

        fail_db_op = DBOperation(fail_path)
        opened.append(fail_db_op)
        fail_db_op.init_db()

        # The full result is fetched only to count the rows; drop it right
        # away so it does not sit in memory alongside the batches.
        contents = source_db_op.query_all(['Content'])
        caseNum = len(contents)
        contents.clear()

        batch_num = math.ceil(caseNum / batch_size)
        last_batch = batch_num - 1
        for x in range(batch_num):
            print(f"\n*****SynFlt_Batch{x} start")
            if x < last_batch:
                start = x * batch_size + 1
                end = (x + 1) * batch_size
            else:
                # NOTE(review): the final batch starts at x * batch_size,
                # whereas every other batch starts at x * batch_size + 1.
                # If query_part() bounds are inclusive, this re-reads the
                # last row of the previous batch -- confirm against
                # DBOperation.query_part before changing.
                start = x * batch_size
                end = caseNum
            contents = source_db_op.query_part(['Content'], start, end)
            processor = SyntaxFiltration(contents, target_path, fail_path, n_threads)
            processor.execute()
    finally:
        # Release in reverse order of acquisition.
        for db_op in reversed(opened):
            db_op.finalize()
def db2list(db_path: str) -> typing.List[str]:
    """Read the data stored in a single database file.

    Queries the ``Content`` column of the database at ``db_path``,
    finalizes the database handle, and returns the first field of every
    row with surrounding whitespace stripped.
    """
    reader = DBOperation(db_path)
    rows = reader.query_all(['Content'])
    reader.finalize()
    # Each row is indexable; only its first field is kept.
    return [row[0].strip() for row in rows]
def readFunctions(self):
    """Return the ``Content`` column of the corpus database.

    The database path is taken from ``self.hparams.corpus_db``. The rows are
    returned exactly as ``query_all(['Content'])`` produces them, with no
    post-processing.
    """
    target_db = DBOperation(self.hparams.corpus_db)
    try:
        return target_db.query_all(['Content'])
    finally:
        # Release the handle once the rows have been fetched, including when
        # the query raises.
        target_db.finalize()