Exemplo n.º 1
0
    def scrape_service(self, service_id: int, Scraper: Type[BaseScraper],
                       manga_info: Collection[MangaServiceInfo]):
        """Scrape the given manga entries of a single service.

        All updates run inside one transaction on a pooled connection.

        NOTE(review): this block is truncated in this view — the final bare
        ``except:`` at the end has no body here.
        """
        with self.conn() as conn:
            # `with conn` wraps the whole batch in a single transaction.
            with conn:
                scraper = Scraper(conn, DbUtil(conn))
                rng = random.Random()  # NOTE(review): unused in the visible code
                manga_ids = set()  # manga that actually received updates
                errors = 0

                idx = 0  # NOTE(review): unused in the visible code
                for info in manga_info:
                    title_id = info['title_id']
                    manga_id = info['manga_id']
                    feed_url = info['feed_url']
                    logger.info(f'Updating {title_id} on service {service_id}')
                    try:
                        # NOTE(review): `:=` has lower precedence than `is`, so
                        # res is bound to the bool of `scrape_series(...) is True`,
                        # not the scrape result — the `res is None` branch below
                        # can never trigger. Likely intended:
                        # `(res := scraper.scrape_series(...)) is True`.
                        if res := scraper.scrape_series(
                                title_id, service_id, manga_id,
                                feed_url) is True:
                            manga_ids.add(manga_id)
                        elif res is None:
                            errors += 1
                            logger.error(
                                f'Failed to scrape series {title_id} {manga_id}'
                            )
                    except psycopg2.Error:
                        # Roll back the failed transaction and push this manga's
                        # next update forward so it is not retried immediately.
                        conn.rollback()
                        logger.exception(
                            f'Database error while updating manga {title_id} on service {service_id}'
                        )
                        scraper.dbutil.update_manga_next_update(
                            service_id, manga_id, scraper.next_update())
                        errors += 1
                    except:
Exemplo n.º 2
0
    def setUpClass(cls) -> None:
        """Load the cached test feed and set up the scanlation group once per class."""
        super(KireiCakeTest, cls).setUpClass()

        with open(test_feed, 'r', encoding='utf-8') as feed_file:
            cls.test_data = feed_file.read()

        # Resolve (or create) the group id and propagate it to the expected entries.
        group = DbUtil(cls._conn).get_or_create_group(KireiCake.NAME)
        cls.group_id = group.group_id
        for entry in correct_entries.values():
            entry.group_id = cls.group_id
Exemplo n.º 3
0
    def do_scheduled_runs(self) -> List[int]:
        """Process pending scheduled runs.

        Skips (and deletes) runs for services that are disabled or do not
        support scheduled runs, enforces the per-service run limit, and
        force-runs the remaining entries.

        Returns:
            The ids of manga for which a run was started.
        """
        with self.conn() as conn:
            dbutil = DbUtil(conn)
            to_delete = []
            updated_manga: List[int] = []
            runs_per_service: Counter = Counter()

            # Services flagged as disabled never receive scheduled runs.
            disabled_services = {
                service.service_id
                for service in dbutil.get_services()
                if service.disabled
            }

            for scheduled in dbutil.get_scheduled_runs():
                manga_id = scheduled.manga_id
                service_id = scheduled.service_id
                title_id = scheduled.title_id
                Service = SCRAPERS_ID[service_id]

                if not Service.CONFIG.scheduled_runs_enabled:
                    logger.warning(
                        f'Tried to schedule run for service {Service.__name__} when it does not support scheduled runs.'
                    )
                    to_delete.append((manga_id, service_id))
                    continue

                if service_id in disabled_services:
                    logger.warning(
                        f'Tried to schedule run for service {Service.__name__} when it is disabled.'
                    )
                    to_delete.append((manga_id, service_id))
                    continue

                # Over-limit runs are neither executed nor deleted; they stay
                # queued for a later pass.
                if runs_per_service[service_id] >= Service.CONFIG.scheduled_run_limit:
                    continue

                runs_per_service[service_id] += 1

                if not title_id:
                    logger.error(
                        f'Manga {manga_id} on service {service_id} scheduled but not found from manga service'
                    )
                    to_delete.append((manga_id, service_id))
                    continue

                self.force_run(service_id, manga_id)
                to_delete.append((manga_id, service_id))
                updated_manga.append(manga_id)

            dbutil.delete_scheduled_runs(to_delete)
            dbutil.update_scheduled_run_disabled(list(runs_per_service.keys()))

            return updated_manga
Exemplo n.º 4
0
    def __init__(self, conn, dbutil: Optional['DbUtil'] = None):
        """Store the database connection and ensure a DbUtil helper exists.

        Raises:
            NotImplementedError: if the subclass has not defined CONFIG.
        """
        if self.CONFIG is NotImplemented:
            raise NotImplementedError(
                f'Service config value not set for {type(self).__name__}')

        self._conn = conn
        if dbutil is not None:
            self._dbutil = dbutil
        else:
            # Local import — presumably avoids a circular dependency at
            # module import time.
            from src.utils.dbutils import DbUtil
            self._dbutil = DbUtil(conn)
Exemplo n.º 5
0
    def __init__(self):
        """Read database settings from the environment and create the
        connection pool and worker thread pool.
        """
        # Read all required settings up front so a missing variable fails fast.
        db_host = os.environ['DB_HOST']
        db_name = os.environ['DB_NAME']
        db_user = os.environ['DB_USER']
        db_pass = os.environ['DB_PASSWORD']
        db_port = os.environ['DB_PORT']

        self.pool = ThreadedConnectionPool(
            1, self.MAX_POOLS,
            host=db_host,
            port=db_port,
            user=db_user,
            password=db_pass,
            dbname=db_name,
            cursor_factory=LoggingDictCursor)
        # One fewer worker than pooled connections.
        self.thread_pool = ThreadPoolExecutor(max_workers=self.MAX_POOLS - 1)

        with self.conn() as conn:
            inject_service_values(DbUtil(conn))
Exemplo n.º 6
0
def setup_tests(request):
    """Session-level pytest setup.

    Disables ``time.sleep`` for the whole run, and initializes the test
    database plus dummy scrapers only when a database-backed test case was
    collected. Registers a finalizer that tears the database down again.
    """
    # No need to sleep in tests
    time.sleep = lambda *_: None

    # Initiate database only if it is required for the tests we are running.
    # item.cls is None for function-based test items, so guard before
    # issubclass (which would raise TypeError on None).
    requires_database = any(
        item.cls is not None
        and issubclass(item.cls, BaseTestClasses.DatabaseTestCase)
        for item in request.node.items
    )

    if not requires_database:
        return

    print('setting up')
    start_db()
    conn = create_db(None if not Postgresql else Postgresql.cache)
    dbutil = DbUtil(conn)
    inject_service_values(dbutil)

    # Register the dummy scrapers both by URL and by id so tests can resolve them.
    DummyScraper(conn, dbutil).add_service()
    DummyScraper2(conn, dbutil).add_service()

    from src.scrapers import SCRAPERS, SCRAPERS_ID
    SCRAPERS[DummyScraper.URL] = DummyScraper
    SCRAPERS[DummyScraper2.URL] = DummyScraper2

    SCRAPERS_ID[DummyScraper.ID] = DummyScraper
    SCRAPERS_ID[DummyScraper2.ID] = DummyScraper2

    conn.close()

    def fin():
        print('\nDeleting test db')
        teardown_db()

    request.addfinalizer(fin)
Exemplo n.º 7
0
    def setUpClass(cls) -> None:
        """Insert the author and manga fixtures and load the cached API responses."""
        super(MangadexTests, cls).setUpClass()

        dbutil = DbUtil(cls._conn)
        author = AuthorPartial(
            name='Im Dal-Young',
            mangadex_id='d21a9418-817a-43e5-a4d2-bf1e7391d7ec')
        dbutil.add_authors([author])

        manga = MangaService(
            service_id=MangaDex.ID,
            title_id='6fe9349a-8eeb-42cf-bead-e6f40b2653de',
            title="I Was Born as the Demon Lord's Daughter")
        dbutil.add_manga_service(manga, add_manga=True)

        # Canned MangaDex API responses recorded next to this test module.
        api_path = os.path.join(os.path.dirname(__file__), 'api_data')

        with open(os.path.join(api_path, 'chapters.json'), 'r', encoding='utf-8') as fp:
            cls.chapters_data = json.load(fp)

        with open(os.path.join(api_path, 'manga.json'), 'r', encoding='utf-8') as fp:
            cls.manga_data = json.load(fp)
Exemplo n.º 8
0
    def run_once(self):
        """Run one scheduler pass.

        Scrapes due manga per service concurrently, scrapes "whole" services
        sequentially, processes scheduled runs, refreshes latest-release and
        chapter-interval data for every updated manga, and returns the time
        the next pass should run (datetime).
        """
        with self.conn() as conn:
            futures = []
            # Due manga grouped per service; title/manga/feed info is
            # aggregated into one JSON array per service row.
            sql = '''
                SELECT ms.service_id, s.url, array_agg(json_build_object('title_id', ms.title_id, 'manga_id', ms.manga_id, 'feed_url', ms.feed_url)) as manga_info
                FROM manga_service ms
                INNER JOIN services s ON s.service_id=ms.service_id
                WHERE NOT (s.disabled OR ms.disabled) AND (s.disabled_until IS NULL OR s.disabled_until < NOW()) AND (ms.next_update IS NULL OR ms.next_update < NOW())
                GROUP BY ms.service_id, s.url
            '''

            with conn.cursor() as cursor:
                cursor.execute(sql)

                manga_ids = set()
                for row in cursor:
                    # Only scrape a small random batch per service each pass.
                    batch_size = random.randint(3, 6)
                    Scraper = SCRAPERS.get(row['url'])
                    if not Scraper:
                        logger.error(f'Failed to find scraper for {row}')
                        continue

                    # Per-service scraping runs concurrently in the thread pool.
                    futures.append(
                        self.thread_pool.submit(
                            self.scrape_service, row['service_id'], Scraper,
                            row['manga_info'][:batch_size]))

            # Services that are scraped as a whole (one feed for the service)
            # rather than per manga.
            sql = """SELECT s.service_id, sw.feed_url, s.url
                     FROM service_whole sw INNER JOIN services s on sw.service_id = s.service_id
                     WHERE NOT s.disabled AND (sw.next_update IS NULL OR sw.next_update < NOW())"""

            services = []
            with conn.cursor() as cursor:
                cursor.execute(sql)
                for row in cursor:
                    services.append(row)

            for service in services:
                Scraper = SCRAPERS.get(service['url'])
                if not Scraper:
                    logger.error(f'Failed to find scraper for {service}')
                    continue

                scraper = Scraper(conn, DbUtil(conn))
                # NOTE(review): rows are accessed both by key and by index in
                # this loop — assumes a dict-like cursor row supporting both.
                logger.info(f'Updating service {service[2]}')

                # Each service scrape runs in its own transaction.
                with conn:
                    try:
                        retval = scraper.scrape_service(
                            service[0], service[1], None)
                    except psycopg2.Error:
                        logger.exception(
                            f'Database error while scraping {service[1]}')
                        scraper.set_checked(service[0])
                        continue
                    except:
                        # NOTE(review): bare except also swallows SystemExit /
                        # KeyboardInterrupt; `except Exception` would be safer.
                        logger.exception(
                            f'Failed to scrape service {service[1]}')
                        scraper.set_checked(service[0])
                        continue

                # Mark the service checked even on success so next_update advances.
                scraper.set_checked(service[0])
                if retval:
                    manga_ids.update(retval)

            conn.commit()

            # Scheduled (forced) runs also contribute updated manga ids.
            retval = self.do_scheduled_runs()
            manga_ids.update(retval)

            # Wait for the concurrent per-service scrapes and collect results.
            for r in futures:
                res = r.result()
                if isinstance(res, set):
                    manga_ids.update(res)

            with conn:
                if manga_ids:
                    logger.debug(
                        f"Updating interval of {len(manga_ids)} manga")
                    dbutil = DbUtil(conn)
                    with conn.cursor() as cursor:
                        dbutil.update_latest_release(list(manga_ids),
                                                     cur=cursor)
                        for manga_id in manga_ids:
                            dbutil.update_chapter_interval(manga_id,
                                                           cur=cursor)

            # Earliest upcoming next_update across manga services and whole
            # services decides when this method should run again.
            sql = '''
            SELECT MIN(t.update) FROM (
                SELECT
                   LEAST(
                       GREATEST(MIN(ms.next_update), s.disabled_until),
                       (
                           SELECT MIN(GREATEST(sw.next_update, s2.disabled_until))
                           FROM service_whole sw 
                               INNER JOIN services s2 ON s2.service_id = sw.service_id 
                           WHERE s2.disabled=FALSE
                       )
                   ) as update
                FROM manga_service ms
                INNER JOIN services s ON s.service_id = ms.service_id
                WHERE s.disabled=FALSE AND ms.disabled=FALSE
                GROUP BY s.service_id, ms.service_id
            ) as t
            '''
            with conn.cursor() as cursor:
                cursor.execute(sql)
                retval = cursor.fetchone()
                if not retval:
                    # Nothing scheduled at all -> check again in an hour.
                    return datetime.utcnow() + timedelta(hours=1)
                return retval[0]
Exemplo n.º 9
0
    def force_run(self,
                  service_id: int,
                  manga_id: Optional[int] = None) -> Optional[Union[bool, Set[int]]]:
        """Force an immediate scrape.

        With manga_id given, scrapes that single title and returns the
        scraper's result (None on any failure). Without manga_id, scrapes the
        whole service and returns the set of updated manga ids.
        """
        if service_id not in SCRAPERS_ID:
            logger.warning(f'No service found with id {service_id}')
            return None

        with self.conn() as conn:
            if manga_id is not None:
                # Single manga: fetch its own feed url plus the service-wide
                # feed url as a fallback.
                sql = '''
                    SELECT ms.service_id, s.url, ms.title_id, ms.manga_id, ms.feed_url, sw.feed_url as service_feed_url
                    FROM manga_service ms
                    INNER JOIN services s ON s.service_id=ms.service_id
                    LEFT JOIN service_whole sw ON s.service_id = sw.service_id
                    WHERE s.service_id=%s AND ms.manga_id=%s
                '''
                with conn.cursor() as cursor:
                    cursor.execute(sql, (service_id, manga_id))
                    row = cursor.fetchone()

                if not row:
                    logger.debug(
                        f'Failed to find manga {manga_id} from service {service_id}'
                    )
                    return None

                Scraper = SCRAPERS.get(row['url'])
                if not Scraper:
                    logger.error(f'Failed to find scraper for {row}')
                    return None

                scraper = Scraper(conn)

                title_id: str = row['title_id']
                manga_id = cast(int, row['manga_id'])
                # Feed url is the feed url of the manga or if that's not defined
                # the feed url of the service. Manga url always takes priority
                feed_url: str = row['feed_url'] or row['service_feed_url']

                logger.info(
                    f'Force updating {title_id} on service {service_id}')
                # Run the scrape inside a transaction.
                with conn:
                    try:
                        retval = scraper.scrape_series(title_id,
                                                       service_id,
                                                       manga_id,
                                                       feed_url=feed_url)
                    except psycopg2.Error:
                        logger.exception(
                            f'Database error while scraping {service_id} {scraper.NAME}: {title_id}'
                        )
                        return None
                    except:
                        # NOTE(review): bare except also swallows SystemExit /
                        # KeyboardInterrupt; `except Exception` would be safer.
                        logger.exception(
                            f'Failed to scrape service {service_id}')
                        return None

                    if retval is None:
                        logger.error(f'Failed to scrape series {row}')
                        return None

                return retval

            else:
                # Whole service: scrape its single service-wide feed.
                sql = """SELECT s.service_id, sw.feed_url, s.url
                         FROM service_whole sw INNER JOIN services s on sw.service_id = s.service_id
                         WHERE s.service_id=%s"""

                manga_ids: Set[int] = set()
                with conn.cursor() as cursor:
                    cursor.execute(sql, (service_id, ))
                    row = cursor.fetchone()
                    if not row:
                        logger.debug(f'Failed to find service {service_id}')
                        return None

                Scraper = SCRAPERS.get(row['url'])
                if not Scraper:
                    logger.error(f'Failed to find scraper for {row}')
                    return None

                scraper = Scraper(conn, DbUtil(conn))
                logger.info(f'Updating service {row["url"]}')
                with conn:
                    updated_ids = scraper.scrape_service(
                        row['service_id'], row['feed_url'], None)
                if updated_ids:
                    manga_ids.update(updated_ids)

                return manga_ids
Exemplo n.º 10
0
 def setUp(self) -> None:
     """Create a fresh DbUtil wrapper around the shared test connection."""
     self.dbutil = DbUtil(self._conn)
Exemplo n.º 11
0
    class DatabaseTestCase(unittest.TestCase):
        """Base class for test cases that need a live database connection.

        Provides a shared connection, unique title id generation and a set of
        custom assertion helpers for chapters and dates.
        """

        # Initialized in setUpClass; NotImplemented until then.
        _conn: Connection = NotImplemented
        _generator: 'BaseTestClasses.TitleIdGenerator' = NotImplemented

        @classmethod
        def setUpClass(cls) -> None:
            cls._conn = get_conn()
            # Integers are retained during tests but they reset to the default value
            # for some reason. Circumvent this by using a class.
            cls._generator = BaseTestClasses.TitleIdGenerator()

        @classmethod
        def tearDownClass(cls) -> None:
            cls._conn.close()

        @property
        def conn(self) -> Connection:
            """The shared class-level database connection."""
            return self._conn

        def setUp(self) -> None:
            self.dbutil = DbUtil(self._conn)

        def get_str_id(self) -> str:
            """Generate a unique string id scoped to this test class."""
            return self._generator.generate(type(self).__name__)

        def get_manga_service(self, scraper: Type['BaseScraper'] = DummyScraper) -> MangaService:
            """Build (without persisting) a MangaService with a fresh title id."""
            title_id = self.get_str_id()
            return MangaService(service_id=scraper.ID, title_id=title_id,
                                title=f'{title_id}_manga')

        def create_manga_service(self, scraper: Type['BaseScraper'] = DummyScraper) -> MangaService:
            """Build and persist a MangaService (and its manga) in the database."""
            title_id = self.get_str_id()
            manga_service = MangaService(service_id=scraper.ID, title_id=title_id,
                                         title=f'{title_id}_manga')
            return self.dbutil.add_manga_service(manga_service, add_manga=True)

        def assertChapterEqualsRow(self, chapter: 'Chapter', row: DictRow) -> None:
            """Compare a Chapter model to a database row attribute by attribute."""
            # (model attribute, row column[, custom value extractor]) triples.
            pairs = [
                ('chapter_title', 'title'),
                ('chapter_number', 'chapter_number'),
                ('decimal', 'chapter_decimal'),
                ('release_date', 'release_date',
                 lambda: (getattr(chapter, 'release_date'), date_fix(row['release_date']))
                 ),
                ('chapter_identifier', 'chapter_identifier'),
                ('group', 'group')
            ]

            for pair in pairs:
                chapter_attr, row_attr = pair[0], pair[1]
                if len(pair) == 3:
                    # Custom extractor (e.g. timezone-normalized dates).
                    c_val, r_val = pair[2]()  # type: ignore[operator]
                else:
                    c_val, r_val = getattr(chapter, chapter_attr), row[row_attr]

                if c_val != r_val:
                    self.fail(
                        'Chapter from database does not equal model\n'
                        f'{chapter_attr} != {row_attr}\n'
                        f'{c_val} != {row[row_attr]}'
                    )

        def assertDatesEqual(self, date1: datetime, date2: datetime):
            """Fail unless both dates compare equal after normalization."""
            d1, d2 = date_fix(date1), date_fix(date2)
            if d1 != d2:
                self.fail(f'Date {date1} does not match date {date2}')

        def assertDatesNotEqual(self, date1: datetime, date2: datetime):
            """Fail if both dates compare equal after normalization."""
            d1, d2 = date_fix(date1), date_fix(date2)
            if d1 == d2:
                self.fail(f'Date {date1} equals date {date2}')

        def assertDateGreater(self, date1: datetime, date2: datetime):
            """Fail unless date1 is strictly later than date2."""
            if date_fix(date1) <= date_fix(date2):
                self.fail(f'Date {date1} is earlier or equal to {date2}')

        def assertDateLess(self, date1: datetime, date2: datetime):
            """Fail unless date1 is strictly earlier than date2."""
            if date_fix(date1) >= date_fix(date2):
                self.fail(f'Date {date1} is later or equal to {date2}')

        def assertDatesAlmostEqual(self, date1: datetime, date2: datetime,
                                   delta: timedelta = timedelta(seconds=1),
                                   msg: str = None):
            """Fail unless the dates differ by at most delta after normalization."""
            self.assertAlmostEqual(date_fix(date1), date_fix(date2),
                                   delta=delta, msg=msg)

        def assertMangaServiceExists(self, title_id: str, service_id: int):
            """Fail unless manga_service has a row for (service_id, title_id)."""
            sql = 'SELECT 1 FROM manga_service WHERE service_id=%s AND title_id=%s'
            with self.conn.cursor() as cursor:
                cursor.execute(sql, (service_id, title_id))
                found = cursor.fetchone()

            self.assertIsNotNone(found, msg=f'Manga {title_id} not found')

        def assertMangaWithTitleFound(self, title: str):
            """Fail unless a manga with the given title exists."""
            found = self.dbutil.find_manga_by_title(title)
            self.assertIsNotNone(
                found,
                msg=f'Manga with title {title} not found when expected to be found'
            )

        @staticmethod
        def utcnow() -> datetime:
            """
            Return utc time with psycopg2 timezone
            """
            utc = psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)
            return datetime.utcnow().replace(tzinfo=utc)