def dump(self):
    self.make_output_dir()

    database_busy_file = self.settings.get('database_busy_file')

    # Touch a sentinel file so other processes know the database is busy.
    if database_busy_file:
        with open(database_busy_file, 'w'):
            pass

    self._drain_to_working_set()

    if database_busy_file:
        os.remove(database_busy_file)

    self._feed_input_sorters()

    with new_session() as session:
        for project_id, sorter in self.project_result_sorters.items():
            project = session.query(Project).filter_by(
                name=project_id).first()

            if self.settings['include_settings']:
                self.dump_project_settings(project)

            self.dump_project(project, sorter)

            if self.settings['zip']:
                self.zip_project(project)

    os.remove(self.working_set_filename)
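# The busy file above is a plain sentinel: it exists only while the drain is
# touching the database, so an external job can wait for it to disappear
# before opening the database file itself. A minimal sketch of such a
# consumer (hypothetical; the function name and polling policy are
# assumptions, not part of this module):

import os
import time


def wait_until_database_idle(busy_file, poll_interval=1.0):
    # Block until the dumper removes its busy-file sentinel.
    while os.path.exists(busy_file):
        time.sleep(poll_interval)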
def dump(self): self.make_output_dir() database_busy_file = self.settings.get("database_busy_file") if database_busy_file: with open(database_busy_file, "w"): pass self._drain_to_working_set() if database_busy_file: os.remove(database_busy_file) self._feed_input_sorters() with new_session() as session: for project_id, sorter in self.project_result_sorters.items(): project = session.query(Project).filter_by(name=project_id).first() if self.settings["include_settings"]: self.dump_project_settings(project) self.dump_project(project, sorter) if self.settings["zip"]: self.zip_project(project) os.remove(self.working_set_filename)
def recover(self):
    logger.info('Recovering from %s', self.args.working_set_file)

    with open(self.args.working_set_file, 'r') as file, \
            new_session() as session:
        query = insert(Result)
        values = []

        for line_num, line in enumerate(file):
            # Each line is a base64-encoded pickle written by
            # _drain_to_working_set().
            doc = pickle.loads(base64.b64decode(line))
            values.append({
                'project_id': doc['project_id'],
                'shortcode': doc['shortcode'],
                'url': doc['url'],
                'encoding': doc['encoding'],
                'datetime': doc['datetime'],
            })

            if line_num % 10000 == 0:
                logger.info('Recover progress: %d', line_num)
                session.execute(query, values)
                session.commit()
                values = []

        logger.info('Finishing up...')

        # Guard against an empty final batch: executing an insert with no
        # parameters raises in SQLAlchemy.
        if values:
            session.execute(query, values)

        session.commit()

    logger.info('Done!')
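# Each working-set line is a base64-encoded pickle of one result row, so the
# format round-trips without the database. A minimal standalone sketch of the
# encode/decode pair (the helper names are illustrative; the field names
# match those consumed by recover() above):

import base64
import datetime
import pickle


def encode_working_set_line(doc):
    # One result dict -> one base64 line (the writer appends the newline).
    return base64.b64encode(pickle.dumps(doc))


def decode_working_set_line(line):
    # Inverse of encode_working_set_line(); accepts bytes or str.
    return pickle.loads(base64.b64decode(line))


# Example round trip:
_doc = {
    'project_id': 'test', 'shortcode': 'a', 'url': 'http://example.com/',
    'encoding': 'ascii', 'datetime': datetime.datetime.utcnow(),
}
assert decode_working_set_line(encode_working_set_line(_doc)) == _doc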
def generate_mock(self): with new_session() as session: items = [] for i in range(self.args.count): if i % 100 == 0: print(i) if self.args.projects == 1: project_id = "test" else: project_id = "test_{}".format(random.randint(1, self.args.projects)) items.append( { "project_id": project_id, "shortcode": self.generate_shortcode(), "url": self.generate_url(), "encoding": "ascii", "datetime": datetime.datetime.utcnow(), } ) print("Running insertion") session.execute(insert(Result), items)
def generate_mock(self):
    with new_session() as session:
        for project_num in range(1, self.args.count + 1):
            project_id = 'test_{}'.format(project_num)
            project = Project(name=project_id)

            print('Running insertion')
            session.add(project)
def dump(self):
    self.make_output_dir()

    with new_session() as session:
        for project in session.query(Project):
            if self.settings['include_settings']:
                self.dump_project_settings(project)

            self.dump_project(project, session)

            if self.settings['zip']:
                self.zip_project(project)
def generate_mock(self): with new_session() as session: for project_num in range(1, self.args.count + 1): project_id = "test_{}".format(project_num) project = Project(name=project_id) if project_num == 2: project.url_template = "http://example.com/{shortcode}/slash/" print("Running insertion") session.add(project)
def generate_mock(self):
    with new_session() as session:
        for project_num in range(1, self.args.count + 1):
            project_id = 'test_{}'.format(project_num)
            project = Project(name=project_id)

            if project_num == 2:
                project.url_template = 'http://example.com/{shortcode}/slash/'

            print('Running insertion')
            session.add(project)
def generate_mock(self):
    with new_session() as session:
        items = []

        for i in range(self.args.count):
            if i % 100 == 0:
                print(i)

            if self.args.projects == 1:
                project_id = 'test'
            else:
                project_id = 'test_{}'.format(
                    random.randint(1, self.args.projects))

            items.append({
                'project_id': project_id,
                'shortcode': self.generate_shortcode(),
                'url': self.generate_url(),
                'encoding': 'ascii',
                'datetime': datetime.datetime.utcnow(),
            })

        print('Running insertion')
        session.execute(insert(Result), items)
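# generate_shortcode() and generate_url() are called above but not shown in
# this section. A plausible minimal sketch (an assumption for illustration,
# not the project's actual implementation):

import random
import string


def generate_shortcode(self, length=6):
    # Random lowercase alphanumeric shortcode, e.g. 'a3k9qz'.
    return ''.join(
        random.choice(string.ascii_lowercase + string.digits)
        for _ in range(length))


def generate_url(self):
    # Fake long URL for the mock result row.
    return 'http://example.com/{}'.format(random.randint(0, 10 ** 9))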
def recover(self):
    logger.info('Recovering from %s', self.args.working_set_file)

    with gzip.open(self.args.working_set_file, 'rb') as file, \
            new_session() as session:
        query = insert(Result)
        values = []
        line_num = 0

        while True:
            # The stream is a sequence of pickled dicts terminated by the
            # sentinel string 'eof'.
            doc = pickle.load(file)

            if doc == 'eof':
                break

            values.append({
                'project_id': doc['project_id'],
                'shortcode': doc['shortcode'],
                'url': doc['url'],
                'encoding': doc['encoding'],
                'datetime': doc['datetime'],
            })

            if line_num % 10000 == 0:
                logger.info('Recover progress: %d', line_num)
                session.execute(query, values)
                session.commit()
                values = []

            line_num += 1

        logger.info('Finishing up...')

        # Guard against an empty final batch: executing an insert with no
        # parameters raises in SQLAlchemy.
        if values:
            session.execute(query, values)

        session.commit()

    logger.info('Done!')
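# The gzip-based recover() above expects a stream of pickled dicts terminated
# by the literal string 'eof'. A minimal sketch of the matching writer side
# (an assumption about the producer, mirroring exactly what the reader
# consumes):

import gzip
import pickle


def write_working_set(path, docs):
    with gzip.open(path, 'wb') as work_file:
        for doc in docs:
            pickle.dump(doc, work_file)

        pickle.dump('eof', work_file)  # sentinel checked by recover()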
def _drain_to_working_set(self, size=1000):
    logger.info('Draining to working set %s', self.working_set_filename)

    assert not os.path.exists(self.working_set_filename)

    with new_session() as session:
        query = session.query(Result)

        if self.after:
            query = query.filter(Result.datetime > self.after)

        with open(self.working_set_filename, 'wb') as work_file:
            last_id = -1
            num_results = 0
            running = True

            while running:
                # Optimized for SQLite scrolling window
                rows = query.filter(Result.id > last_id).limit(size).all()

                if not rows:
                    break

                delete_ids = []

                for result in rows:
                    line = base64.b64encode(pickle.dumps({
                        'id': result.id,
                        'project_id': result.project_id,
                        'shortcode': result.shortcode,
                        'url': result.url,
                        'encoding': result.encoding,
                        'datetime': result.datetime,
                    }))
                    work_file.write(line)
                    work_file.write(b'\n')

                    num_results += 1
                    self.items_count += 1
                    # Advance the scrolling window; without this the next
                    # query re-reads the same rows when deletion is disabled.
                    last_id = result.id
                    delete_ids.append(result.id)

                    if num_results % 10000 == 0:
                        logger.info('Drain progress: %d', num_results)

                    if num_results % 100000 == 0:
                        # Risky, but need to do this since WAL
                        # performance is low on large transactions
                        logger.info(
                            "Checkpoint. (Don't delete stray files if "
                            "program crashes!)")
                        work_file.flush()
                        session.commit()

                    if self.max_items and num_results >= self.max_items:
                        logger.info('Reached max items %d.', self.max_items)
                        running = False
                        break

                if self.settings['delete']:
                    delete_query = delete(Result).where(
                        Result.id == bindparam('id'))
                    session.execute(
                        delete_query,
                        [{'id': result_id} for result_id in delete_ids])
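# The drain loop above uses keyset pagination ("Result.id > last_id" plus
# LIMIT) instead of OFFSET, so each query stays cheap on SQLite even as it
# walks millions of rows. A self-contained sketch of the same pattern against
# a throwaway in-memory table (assumes SQLAlchemy 1.4+ is installed; the
# table and names are illustrative, not this module's; the explicit ORDER BY
# makes the rowid ordering the original relies on explicit):

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Row(Base):
    __tablename__ = 'rows'

    id = Column(Integer, primary_key=True)
    value = Column(String)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([Row(value='v{}'.format(i)) for i in range(25)])
    session.commit()

    last_id = -1

    while True:
        # Scrolling window: resume strictly after the last id seen.
        batch = (session.query(Row)
                 .filter(Row.id > last_id)
                 .order_by(Row.id)
                 .limit(10)
                 .all())

        if not batch:
            break

        last_id = batch[-1].id
        print('fetched ids', batch[0].id, 'to', last_id)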
def _drain_to_working_set(self, size=1000): logger.info("Draining to working set %s", self.working_set_filename) assert not os.path.exists(self.working_set_filename) with new_session() as session: query = session.query(Result) if self.after: query = query.filter(Result.datetime > self.after) with open(self.working_set_filename, "wb") as work_file: last_id = -1 num_results = 0 running = True while running: # Optimized for SQLite scrolling window rows = query.filter(Result.id > last_id).limit(size).all() if not rows: break delete_ids = [] for result in rows: line = base64.b64encode( pickle.dumps( { "id": result.id, "project_id": result.project_id, "shortcode": result.shortcode, "url": result.url, "encoding": result.encoding, "datetime": result.datetime, } ) ) work_file.write(line) work_file.write(b"\n") num_results += 1 self.items_count += 1 delete_ids.append(result.id) if num_results % 10000 == 0: logger.info("Drain progress: %d", num_results) if num_results % 100000 == 0: # Risky, but need to do this since WAL # performance is low on large transactions logger.info("Checkpoint. (Don't delete stray files if program crashes!)") work_file.flush() session.commit() if self.max_items and num_results >= self.max_items: logger.info("Reached max items %d.", self.max_items) running = False break if self.settings["delete"]: delete_query = delete(Result).where(Result.id == bindparam("id")) session.execute(delete_query, [{"id": result_id} for result_id in delete_ids])