예제 #1
0
    def dump(self):
        """Dump all projects to the output directory via a working set file.

        Optionally touches a "busy" marker file while the database is
        being drained so external watchers know the database is in use.
        """
        self.make_output_dir()

        database_busy_file = self.settings.get('database_busy_file')

        if database_busy_file:
            # Touch the marker so other processes know the DB is busy.
            with open(database_busy_file, 'w'):
                pass

        try:
            self._drain_to_working_set()
        finally:
            # BUG FIX: remove the busy marker even if draining fails,
            # otherwise a stale marker file is left behind.
            if database_busy_file:
                os.remove(database_busy_file)

        self._feed_input_sorters()

        with new_session() as session:
            for project_id, sorter in self.project_result_sorters.items():
                project = session.query(Project).filter_by(
                    name=project_id).first()

                if self.settings['include_settings']:
                    self.dump_project_settings(project)

                self.dump_project(project, sorter)

                if self.settings['zip']:
                    self.zip_project(project)

        os.remove(self.working_set_filename)
예제 #2
0
    def dump(self):
        """Dump all projects to the output directory via a working set file.

        Optionally touches a "busy" marker file while the database is
        being drained so external watchers know the database is in use.
        """
        self.make_output_dir()

        database_busy_file = self.settings.get("database_busy_file")

        if database_busy_file:
            # Touch the marker so other processes know the DB is busy.
            with open(database_busy_file, "w"):
                pass

        try:
            self._drain_to_working_set()
        finally:
            # BUG FIX: remove the busy marker even if draining fails,
            # otherwise a stale marker file is left behind.
            if database_busy_file:
                os.remove(database_busy_file)

        self._feed_input_sorters()

        with new_session() as session:
            for project_id, sorter in self.project_result_sorters.items():
                project = session.query(Project).filter_by(name=project_id).first()

                if self.settings["include_settings"]:
                    self.dump_project_settings(project)

                self.dump_project(project, sorter)

                if self.settings["zip"]:
                    self.zip_project(project)

        os.remove(self.working_set_filename)
예제 #3
0
    def recover(self):
        """Re-insert results from a line-oriented working set file.

        Each line is a base64-encoded pickled dict; rows are inserted
        in batches of 10000.
        """
        logger.info('Recovering from %s', self.args.working_set_file)
        with open(self.args.working_set_file, 'r') as file, \
                new_session() as session:
            query = insert(Result)
            values = []

            for line_num, line in enumerate(file):
                # NOTE(review): pickle is only safe on trusted, locally
                # produced working set files -- never on external input.
                doc = pickle.loads(base64.b64decode(line))

                values.append({
                    'project_id': doc['project_id'],
                    'shortcode': doc['shortcode'],
                    'url': doc['url'],
                    'encoding': doc['encoding'],
                    'datetime': doc['datetime'],
                })

                # Flush a full batch. Checking the batch size (instead of
                # ``line_num % 10000 == 0``) avoids the pointless 1-row
                # flush that used to fire on the very first line.
                if len(values) >= 10000:
                    logger.info('Recover progress: %d', line_num)
                    session.execute(query, values)
                    session.commit()
                    values = []

            logger.info('Finishing up...')
            # BUG FIX: only flush a non-empty remainder; SQLAlchemy raises
            # when an executemany gets an empty parameter list (e.g. for a
            # file whose last line exactly filled the previous batch).
            if values:
                session.execute(query, values)
            session.commit()

        logger.info('Done!')
예제 #4
0
    def generate_mock(self):
        """Insert ``self.args.count`` random mock results in one batch."""
        with new_session() as session:
            items = []

            for i in range(self.args.count):
                if i % 100 == 0:
                    print(i)

                if self.args.projects == 1:
                    project_id = "test"
                else:
                    project_id = "test_{}".format(random.randint(1, self.args.projects))

                items.append(
                    {
                        "project_id": project_id,
                        "shortcode": self.generate_shortcode(),
                        "url": self.generate_url(),
                        "encoding": "ascii",
                        "datetime": datetime.datetime.utcnow(),
                    }
                )

            print("Running insertion")
            # BUG FIX: skip the insert for count == 0; SQLAlchemy rejects
            # an executemany with an empty parameter list.
            if items:
                session.execute(insert(Result), items)
    def recover(self):
        """Re-insert results from a line-oriented working set file.

        Each line is a base64-encoded pickled dict; rows are inserted
        in batches of 10000.
        """
        logger.info('Recovering from %s', self.args.working_set_file)
        with open(self.args.working_set_file, 'r') as file, \
                new_session() as session:
            query = insert(Result)
            values = []

            for line_num, line in enumerate(file):
                # NOTE(review): pickle is only safe on trusted, locally
                # produced working set files -- never on external input.
                doc = pickle.loads(base64.b64decode(line))

                values.append({
                    'project_id': doc['project_id'],
                    'shortcode': doc['shortcode'],
                    'url': doc['url'],
                    'encoding': doc['encoding'],
                    'datetime': doc['datetime'],
                })

                # Flush a full batch. Checking the batch size (instead of
                # ``line_num % 10000 == 0``) avoids the pointless 1-row
                # flush that used to fire on the very first line.
                if len(values) >= 10000:
                    logger.info('Recover progress: %d', line_num)
                    session.execute(query, values)
                    session.commit()
                    values = []

            logger.info('Finishing up...')
            # BUG FIX: only flush a non-empty remainder; SQLAlchemy raises
            # when an executemany gets an empty parameter list (e.g. for a
            # file whose last line exactly filled the previous batch).
            if values:
                session.execute(query, values)
            session.commit()

        logger.info('Done!')
예제 #6
0
    def generate_mock(self):
        """Create mock projects named test_1 through test_N."""
        with new_session() as session:
            for number in range(1, self.args.count + 1):
                project = Project(name='test_{}'.format(number))
                print('Running insertion')
                session.add(project)
예제 #7
0
    def dump(self):
        """Dump every project in the database to the output directory."""
        self.make_output_dir()

        def _dump_one(project, session):
            # Per-project dump: settings file, results, then optional zip.
            if self.settings['include_settings']:
                self.dump_project_settings(project)

            self.dump_project(project, session)

            if self.settings['zip']:
                self.zip_project(project)

        with new_session() as session:
            for project in session.query(Project):
                _dump_one(project, session)
예제 #8
0
    def generate_mock(self):
        """Create numbered mock projects in the database.

        Project #2 receives a URL template ending in a slash so that
        slash handling can be exercised.
        """
        with new_session() as session:
            for num in range(1, self.args.count + 1):
                project = Project(name="test_{}".format(num))

                if num == 2:
                    project.url_template = "http://example.com/{shortcode}/slash/"

                print("Running insertion")
                session.add(project)
예제 #9
0
    def generate_mock(self):
        """Create numbered mock projects in the database.

        Project #2 receives a URL template ending in a slash so that
        slash handling can be exercised.
        """
        with new_session() as session:
            number = 1
            while number <= self.args.count:
                project = Project(name='test_{}'.format(number))

                if number == 2:
                    project.url_template = 'http://example.com/{shortcode}/slash/'

                print('Running insertion')
                session.add(project)
                number += 1
예제 #10
0
    def generate_mock(self):
        """Insert ``self.args.count`` random mock results in one batch."""
        with new_session() as session:
            items = []

            for i in range(self.args.count):
                if i % 100 == 0:
                    print(i)

                if self.args.projects == 1:
                    project_id = 'test'
                else:
                    project_id = 'test_{}'.format(random.randint(1, self.args.projects))

                items.append({
                    'project_id': project_id,
                    'shortcode': self.generate_shortcode(),
                    'url': self.generate_url(),
                    'encoding': 'ascii',
                    'datetime': datetime.datetime.utcnow()
                })

            print('Running insertion')
            # BUG FIX: skip the insert for count == 0; SQLAlchemy rejects
            # an executemany with an empty parameter list.
            if items:
                session.execute(insert(Result), items)
예제 #11
0
    def recover(self):
        """Re-insert results from a gzipped pickle-stream working set.

        The stream is a sequence of pickled dicts terminated by the
        sentinel string ``'eof'``; rows are inserted in batches.
        """
        logger.info('Recovering from %s', self.args.working_set_file)

        with gzip.open(self.args.working_set_file, 'rb') as file, \
                new_session() as session:
            query = insert(Result)
            values = []
            line_num = 0

            while True:
                # NOTE(review): pickle.load raises EOFError if the stream
                # is truncated before the 'eof' sentinel -- confirm that is
                # acceptable for interrupted dumps. pickle is only safe on
                # trusted, locally produced files.
                doc = pickle.load(file)

                if doc == 'eof':
                    break

                values.append({
                    'project_id': doc['project_id'],
                    'shortcode': doc['shortcode'],
                    'url': doc['url'],
                    'encoding': doc['encoding'],
                    'datetime': doc['datetime'],
                })

                line_num += 1

                # Flush a full batch. Checking the batch size (instead of
                # ``line_num % 10000 == 0``) avoids the pointless 1-row
                # flush that used to fire on the very first record.
                if len(values) >= 10000:
                    logger.info('Recover progress: %d', line_num)
                    session.execute(query, values)
                    session.commit()
                    values = []

            logger.info('Finishing up...')
            # BUG FIX: only flush a non-empty remainder; SQLAlchemy raises
            # when an executemany gets an empty parameter list.
            if values:
                session.execute(query, values)
            session.commit()

        logger.info('Done!')
예제 #12
0
    def generate_mock(self):
        """Insert ``self.args.count`` random mock results in one batch."""
        with new_session() as session:
            items = []

            for i in range(self.args.count):
                if i % 100 == 0:
                    print(i)

                if self.args.projects == 1:
                    project_id = 'test'
                else:
                    project_id = 'test_{}'.format(
                        random.randint(1, self.args.projects))

                items.append({
                    'project_id': project_id,
                    'shortcode': self.generate_shortcode(),
                    'url': self.generate_url(),
                    'encoding': 'ascii',
                    'datetime': datetime.datetime.utcnow()
                })

            print('Running insertion')
            # BUG FIX: skip the insert for count == 0; SQLAlchemy rejects
            # an executemany with an empty parameter list.
            if items:
                session.execute(insert(Result), items)
예제 #13
0
    def _drain_to_working_set(self, size=1000):
        """Stream results from the database into the working set file.

        Rows are read in a scrolling window of ``size`` rows keyed and
        ordered by id, serialized one per line (base64 of pickle), and
        optionally deleted from the database.

        :param size: number of rows fetched per window.
        """
        logger.info('Draining to working set %s', self.working_set_filename)

        assert not os.path.exists(self.working_set_filename)

        with new_session() as session:
            query = session.query(Result)

            if self.after:
                query = query.filter(Result.datetime > self.after)

            with open(self.working_set_filename, 'wb') as work_file:
                last_id = -1
                num_results = 0
                running = True

                while running:
                    # Scrolling window keyed on id (optimized for SQLite);
                    # explicit ordering makes the window advance reliable.
                    rows = (query.filter(Result.id > last_id)
                            .order_by(Result.id)
                            .limit(size).all())

                    if not rows:
                        break

                    delete_ids = []

                    for result in rows:
                        line = base64.b64encode(
                            pickle.dumps({
                                'id': result.id,
                                'project_id': result.project_id,
                                'shortcode': result.shortcode,
                                'url': result.url,
                                'encoding': result.encoding,
                                'datetime': result.datetime,
                            }))
                        work_file.write(line)
                        work_file.write(b'\n')

                        num_results += 1
                        self.items_count += 1

                        delete_ids.append(result.id)

                        if num_results % 10000 == 0:
                            logger.info('Drain progress: %d', num_results)

                        if num_results % 100000 == 0:
                            # Risky, but need to do this since WAL
                            # performance is low on large transactions
                            logger.info(
                                "Checkpoint. (Don't delete stray files if program crashes!)"
                            )
                            work_file.flush()
                            session.commit()

                        if self.max_items and num_results >= self.max_items:
                            logger.info('Reached max items %d.',
                                        self.max_items)
                            running = False
                            break

                    # BUG FIX: advance the window. last_id was never
                    # updated, so with deletion disabled the same rows were
                    # re-read forever (infinite loop, duplicate output).
                    last_id = rows[-1].id

                    if self.settings['delete']:
                        delete_query = delete(Result).where(
                            Result.id == bindparam('id'))
                        session.execute(delete_query, [{
                            'id': result_id
                        } for result_id in delete_ids])
예제 #14
0
    def _drain_to_working_set(self, size=1000):
        """Stream results from the database into the working set file.

        Rows are read in a scrolling window of ``size`` rows keyed and
        ordered by id, serialized one per line (base64 of pickle), and
        optionally deleted from the database.

        :param size: number of rows fetched per window.
        """
        logger.info("Draining to working set %s", self.working_set_filename)

        assert not os.path.exists(self.working_set_filename)

        with new_session() as session:
            query = session.query(Result)

            if self.after:
                query = query.filter(Result.datetime > self.after)

            with open(self.working_set_filename, "wb") as work_file:
                last_id = -1
                num_results = 0
                running = True

                while running:
                    # Scrolling window keyed on id (optimized for SQLite);
                    # explicit ordering makes the window advance reliable.
                    rows = query.filter(Result.id > last_id).order_by(Result.id).limit(size).all()

                    if not rows:
                        break

                    delete_ids = []

                    for result in rows:
                        line = base64.b64encode(
                            pickle.dumps(
                                {
                                    "id": result.id,
                                    "project_id": result.project_id,
                                    "shortcode": result.shortcode,
                                    "url": result.url,
                                    "encoding": result.encoding,
                                    "datetime": result.datetime,
                                }
                            )
                        )
                        work_file.write(line)
                        work_file.write(b"\n")

                        num_results += 1
                        self.items_count += 1

                        delete_ids.append(result.id)

                        if num_results % 10000 == 0:
                            logger.info("Drain progress: %d", num_results)

                        if num_results % 100000 == 0:
                            # Risky, but need to do this since WAL
                            # performance is low on large transactions
                            logger.info("Checkpoint. (Don't delete stray files if program crashes!)")
                            work_file.flush()
                            session.commit()

                        if self.max_items and num_results >= self.max_items:
                            logger.info("Reached max items %d.", self.max_items)
                            running = False
                            break

                    # BUG FIX: advance the window. last_id was never
                    # updated, so with deletion disabled the same rows were
                    # re-read forever (infinite loop, duplicate output).
                    last_id = rows[-1].id

                    if self.settings["delete"]:
                        delete_query = delete(Result).where(Result.id == bindparam("id"))
                        session.execute(delete_query, [{"id": result_id} for result_id in delete_ids])