def _collect_meta_from_db(self, db_file):
    """Load every (tid, html) row from the raw_html table into self.tasks.

    db_file: path to the SQLite database to read.
    Appends one (tid, html) tuple per row and logs the running task count.
    """
    query = "SELECT tid, html FROM raw_html;"
    with connect_sqlite(db_file) as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        # Unpack in the loop header; each row becomes one task tuple.
        for tid, html in cursor.fetchall():
            self.tasks.append((tid, html))
    self.logger.info("collected {} tasks".format(len(self.tasks)))
def _collect_meta_from_db(self, db_file):
    """Load every (url, response_body) row from the subreddit table into self.tasks.

    db_file: path to the SQLite database to read.
    Appends one (url, html) tuple per row and logs the running task count.
    """
    query = "SELECT url, response_body FROM subreddit;"
    with connect_sqlite(db_file) as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        # Unpack in the loop header; each row becomes one task tuple.
        for url, html in cursor.fetchall():
            self.tasks.append((url, html))
    self.logger.info("collected {} tasks".format(len(self.tasks)))
def _collect_meta_from_db(self, dbfile):
    """Queue (response_body, url) rows for cnet discussion pages into self.tasks.

    dbfile: path to the SQLite database to read.
    Best-effort: a failure is printed and loading continues with whatever
    rows were already collected.  self.all_loaded is set either way so
    consumers know no more tasks are coming.
    """
    # Implicit concatenation keeps the runtime SQL byte-identical.
    query = ("select response_body, url from cnet "
             "where response_url like 'https://www.cnet.com/forums/discussions/%';")
    with connect_sqlite(dbfile) as conn:
        cursor = conn.cursor()
        try:
            cursor.execute(query)
            # Rows are appended as-is; extend avoids the per-row loop.
            self.tasks.extend(cursor.fetchall())
        except Exception:
            # Matches this file's convention of printing and carrying on.
            traceback.print_exc()
    self.all_loaded = True
    self.logger.info("collected {} tasks".format(len(self.tasks)))
def _collect_meta_from_db(self, db_file):
    """Push every raw_html row onto the task queue as a dict task.

    db_file: path to the SQLite database to read.
    Each task is {'tid': ..., 'html': ...}.  self.all_loaded.value is a
    shared flag (presumably a multiprocessing.Value — confirm with the
    class that owns it) flipped once every row has been queued, and
    self.total_tasks records how many tasks were produced.
    """
    count = 0
    with connect_sqlite(db_file) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT tid, html FROM raw_html;")
        for tid, html in cursor.fetchall():
            count += 1
            self.tasks.put({"tid": tid, "html": html})
    self.all_loaded.value = True
    self.total_tasks = count
    self.logger.info("collected {} tasks".format(self.total_tasks))
def _collect_meta_from_db(self, dbfile):
    """Stream (response_body, url) rows for self.reddit_type into the task queue.

    dbfile: path to the SQLite database to read.
    Rows are fetched in chunks of 1000 (fetchmany) so the whole table is
    never held in memory at once; subreddit listing pages
    ('https://www.reddit.com/r/.../?count=...') are excluded by the SQL
    filter.  self.total_tasks accumulates the number of queued rows and
    self.all_loaded is set once loading ends (best-effort: a failure is
    printed and whatever was already queued remains usable).
    """
    chunk_size = 1000
    with connect_sqlite(dbfile) as conn:
        cursor = conn.cursor()
        # NOTE(review): the table name is interpolated into the SQL.
        # Identifiers cannot be bound as parameters, so this is only safe
        # while self.reddit_type comes from trusted configuration.
        sql = ("select response_body, url from {} where response_url "
               "not like 'https://www.reddit.com/r/%/?count=%';"
               ).format(self.reddit_type)
        try:
            cursor.execute(sql)
            batch = cursor.fetchmany(chunk_size)
            while batch:
                self.total_tasks += len(batch)
                for row in batch:
                    self.tasks.put(row)
                batch = cursor.fetchmany(chunk_size)
        except Exception:
            traceback.print_exc()
    self.all_loaded = True
    # BUG FIX: self.tasks is a queue filled via put() and does not support
    # len() (queue.Queue/multiprocessing.Queue define no __len__), so the
    # original len(self.tasks) would raise TypeError.  Log the counter we
    # already maintain, matching the raw_html loader's behavior.
    self.logger.info("collected {} tasks".format(self.total_tasks))