Example No. 1
    def start_requests(self):
        """Generate start requests."""

        start_date = parse_date(self.settings.get("START_DATE"), tzinfo=timezone.utc)

        if not start_date:
            earliest_date = (
                parse_date(self.settings.get("EARLIEST_DATE"), tzinfo=timezone.utc)
                or now()
            )
            latest_date = (
                parse_date(self.settings.get("LATEST_DATE"), tzinfo=timezone.utc)
                or now()
            )
            # randint() needs integer bounds, so truncate the float timestamps
            start_date_ts = randint(
                int(earliest_date.timestamp()), int(latest_date.timestamp())
            )
            start_date = datetime.fromtimestamp(start_date_ts, tz=timezone.utc)

        self.logger.info("Start date: %s", start_date)

        start_date_str = start_date.strftime(WEB_ARCHIVE_DATE_FORMAT)

        for start_url in self.start_urls:
            yield Request(
                url=start_url.format(date=start_date_str),
                callback=self.parse,
                priority=1,
            )
Example No. 2
def _find_latest_ranking(
    path_dir: Path,
    glob: str = "*.csv",
    star_percentiles: Optional[Iterable[float]] = None,
) -> tc.SFrame:
    path_dir = path_dir.resolve()
    LOGGER.info("Searching <%s> for latest ranking", path_dir)
    path_file = max(
        path_dir.glob(glob),
        key=lambda p: parse_date(p.stem, tzinfo=timezone.utc),
    )

    LOGGER.info("Loading ranking from <%s>", path_file)
    recommendations = tc.SFrame.read_csv(str(path_file))["rank", "bgg_id",
                                                         "score"]

    if star_percentiles:
        buckets = tuple(
            percentile_buckets(recommendations["score"], star_percentiles))
        recommendations["stars"] = [
            star_rating(score=score, buckets=buckets, low=1.0, high=5.0)
            for score in recommendations["score"]
        ]

    return recommendations
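The helper above relies on the ranking CSVs being named after their date, so that max() keyed by the parsed file stem picks the most recent export. A minimal sketch of the assumed layout (directory and file names are hypothetical):

from pathlib import Path

# Hypothetical layout assumed by _find_latest_ranking:
#   rankings/2021-05-30.csv
#   rankings/2021-06-06.csv   <- stem parses to the latest date, so max() picks this file
latest = max(
    Path("rankings").glob("*.csv"),
    key=lambda p: p.stem,  # stand-in for parse_date(p.stem, tzinfo=timezone.utc)
)
print(latest)  # rankings/2021-06-06.csv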
Example No. 3
def serialize_date(date: Any,
                   tzinfo: Optional[timezone] = None) -> Optional[str]:
    """seralize a date into ISO format if possible"""

    parsed = parse_date(date, tzinfo)
    return (
        parsed.strftime("%Y-%m-%dT%T%z") if parsed else str(date) if date else None
    )
Example No. 4
    def parse(self, response):
        """
        @url https://www.boardgamegeek.com/geeklist/30543/bgg-top-50-statistics-meta-list
        @returns items 0 0
        @returns requests 26
        """

        for next_page in response.xpath(
                "//a[contains(@title, 'page')]/@href").extract():
            yield response.follow(
                url=next_page,
                callback=self.parse,
            )

        scraped_at = now()

        for title in (response.xpath("/html/head/title/text()").extract() +
                      response.css("div.geeklist_title::text").extract()):
            match = TITLE_REGEX.match(title)
            published_at = (parse_date(match.group(2), tzinfo=timezone.utc)
                            if match else None)
            if published_at:
                break
        else:
            published_at = None

        for item in response.xpath("//*[@data-objecttype = 'listitem']"):
            result = self.parse_item(
                item=item,
                response=response,
                published_at=published_at,
                scraped_at=scraped_at,
            )
            if result:
                yield result
Example No. 5
def serialize_date(date: Any, tzinfo: Optional[timezone] = None) -> Optional[str]:
    """Seralize a date into ISO format if possible."""

    parsed = parse_date(date, tzinfo)
    return (
        parsed.isoformat(timespec="seconds") if parsed else str(date) if date else None
    )
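Note that Examples 3 and 22 build the same string with strftime("%Y-%m-%dT%T%z"); the %T directive is passed straight through to the platform's C strftime (a glibc shorthand for %H:%M:%S) and may not be supported everywhere, whereas the isoformat(timespec="seconds") call used here is portable. The only difference in output is the colon in the UTC offset:

from datetime import datetime, timezone

dt = datetime(2021, 6, 1, 12, 30, tzinfo=timezone.utc)
dt.strftime("%Y-%m-%dT%T%z")      # '2021-06-01T12:30:00+0000' (platform-dependent %T)
dt.isoformat(timespec="seconds")  # '2021-06-01T12:30:00+00:00'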
Example No. 6
def main():
    """Command line entry point."""

    args = _parse_args()

    logging.basicConfig(
        stream=sys.stderr,
        level=logging.DEBUG if args.verbose > 0 else logging.INFO,
        format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
    )

    LOGGER.info(args)

    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(args.dont_run_before, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    if args.interval and args.dont_run_before and not parse_date(args.dont_run_before):
        dont_run_before = now() + timedelta(seconds=args.interval)
        LOGGER.info(
            "Don't run next time before %s, writing tag to <%s>",
            dont_run_before.isoformat(),
            args.dont_run_before,
        )
        with open(args.dont_run_before, "w") as file_obj:
            file_obj.write(dont_run_before.isoformat())

    update_news(
        s3_src=f"s3://{args.src_bucket}/",
        path_feeds=args.feeds,
        path_merged=args.merged,
        path_split=args.split,
        s3_dst=f"s3://{args.dst_bucket}/",
        split_size=args.split_size,
        log_level="DEBUG"
        if args.verbose > 1
        else "INFO"
        if args.verbose > 0
        else "WARN",
    )
Example No. 7
    def rankings(self, request, pk=None):
        """Find historical rankings of a game."""

        filters = {
            "game": pk,
            "ranking_type__in": clear_list(_extract_params(request, "ranking_type")),
            "date__gte": parse_date(
                request.query_params.get("date__gte"), tzinfo=timezone.utc
            ),
            "date__lte": parse_date(
                request.query_params.get("date__lte"), tzinfo=timezone.utc
            ),
        }
        filters = {k: v for k, v in filters.items() if v}
        queryset = Ranking.objects.filter(**filters)
        serializer = RankingSerializer(
            queryset, many=True, context=self.get_serializer_context()
        )
        return Response(serializer.data)
Example No. 8
def model_updated_at(file_path=settings.MODEL_UPDATED_FILE):
    """latest model update"""
    try:
        with open(file_path) as file_obj:
            updated_at = file_obj.read()
        updated_at = normalize_space(updated_at)
        return parse_date(updated_at, tzinfo=timezone.utc)
    except Exception:
        pass
    return None
Example No. 9
    def history(self, request):
        """History of the top rankings."""

        top = parse_int(request.query_params.get("top")) or 100
        ranking_type = request.query_params.get("ranking_type") or Ranking.BGG

        filters = {
            "ranking_type": ranking_type,
            "date__gte": parse_date(
                request.query_params.get("date__gte"), tzinfo=timezone.utc
            ),
            "date__lte": parse_date(
                request.query_params.get("date__lte"), tzinfo=timezone.utc
            ),
        }
        filters = {k: v for k, v in filters.items() if v}
        queryset = Ranking.objects.filter(**filters)

        last_date = queryset.filter(rank=1).dates("date", "day", order="ASC").last()
        games = [
            r.game
            for r in queryset.filter(date=last_date, rank__lte=top)
            .order_by("rank")
            .select_related("game")
        ]

        assert len(games) == top

        game_ids = frozenset(g.bgg_id for g in games)
        rankings = queryset.filter(game__in=game_ids).order_by("date")

        data = [
            {
                "game": self.get_serializer(game).data,
                "rankings": RankingSerializer(
                    rankings.filter(game=game.bgg_id),
                    many=True,
                    context=self.get_serializer_context(),
                ).data,
            }
            for game in games
        ]
        return Response(data)
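The endpoint above pairs each game in the current top list with that game's full ranking history. A rough sketch of the response shape (the nested field names are assumptions inferred from the filters and ordering used above):

# [
#     {
#         "game": {...},                           # serialized game, currently ranked 1..top
#         "rankings": [                            # that game's rankings, ordered by date
#             {"date": "2021-06-01", "rank": 3, ...},
#             ...
#         ],
#     },
#     ...
# ]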
Example No. 10
    def __init__(self, tag_file, date=None, seconds=None):
        date = parse_date(date, tzinfo=timezone.utc)
        seconds = parse_float(seconds)

        if not date and not seconds:
            raise NotConfigured

        self.tag_file = Path(tag_file).resolve()
        self.tag_file.parent.mkdir(parents=True, exist_ok=True)
        self.date = date
        self.seconds = seconds
Example No. 11
def _rating_data(
    recommender_path=getattr(settings, "RECOMMENDER_PATH", None),
    pk_field="bgg_id",
    rankings_path=None,
    r_g_ranking_effective_date=getattr(settings, "R_G_RANKING_EFFECTIVE_DATE",
                                       None),
):
    recommender = load_recommender(recommender_path, "bgg")

    if not recommender:
        return {}

    r_g_ranking_effective_date = parse_date(
        r_g_ranking_effective_date,
        tzinfo=timezone.utc,
    )

    if (rankings_path and r_g_ranking_effective_date
            and now() >= r_g_ranking_effective_date):
        LOGGER.info(
            "Using new R.G ranking effective from %s",
            r_g_ranking_effective_date,
        )
        recommendations = _find_latest_ranking(
            path_dir=Path(rankings_path),
            star_percentiles=getattr(settings, "STAR_PERCENTILES", None),
        )
    else:
        recommendations = recommender.recommend(
            star_percentiles=getattr(settings, "STAR_PERCENTILES", None),
        )

    LOGGER.info("Loaded recommendations for %d games", len(recommendations))

    count = -1
    result = {}

    for count, game in enumerate(recommendations):
        if count and count % 1000 == 0:
            LOGGER.info("processed %d items so far", count)

        pkey = game.get(pk_field)
        if pkey is None:
            continue

        result[pkey] = {
            "rec_rank": game.get("rank"),
            "rec_rating": game.get("score"),
            "rec_stars": game.get("stars"),
        }

    LOGGER.info("processed %d items in total", count)

    return result
Example No. 12
def _parse_date(date, tzinfo=timezone.utc, format_str=WEB_ARCHIVE_DATE_FORMAT):
    try:
        date = datetime.strptime(date, format_str)
        return date.replace(tzinfo=tzinfo)
    except Exception:
        pass

    try:
        return parse_date(date, tzinfo, format_str)
    except Exception:
        pass

    return None
Example No. 13
    def _local_requests(self, path_dir="."):
        path_dir = Path(path_dir).resolve()

        for path_file in path_dir.iterdir():
            if not path_file.is_file():
                continue

            self.logger.info("Processing <%s>", path_file)

            date = parse_date(path_file.stem, tzinfo=timezone.utc)

            yield Request(url=path_file.as_uri(),
                          callback=self.parse,
                          meta={"published_at": date})
Example No. 14
    def from_crawler(cls, crawler):
        """ init from crawler """

        tag_file = crawler.settings.get("DONT_RUN_BEFORE_FILE")
        date = parse_date(
            crawler.settings.get("DONT_RUN_BEFORE_DATE"), tzinfo=timezone.utc
        )
        seconds = crawler.settings.getfloat("DONT_RUN_BEFORE_SEC")

        if not tag_file or not (seconds or date):
            raise NotConfigured

        obj = cls(tag_file, date, seconds)

        crawler.signals.connect(obj._spider_opened, signals.spider_opened)

        return obj
Example No. 15
def date_from_file(
    path: Union[bytes, str, os.PathLike, None],
    tzinfo: Optional[timezone] = None,
    format_str: Optional[str] = None,
) -> Optional[datetime]:
    """Parse a date from a file."""

    if not path:
        return None

    path = Path(path).resolve()
    LOGGER.info("Reading date from path <%s>", path)

    try:
        with path.open() as file_obj:
            date = normalize_space(next(file_obj, None))
    except Exception:
        date = None

    return parse_date(date=date, tzinfo=tzinfo, format_str=format_str)
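date_from_file is the reader counterpart to the tag file written in Example 6, where main() writes dont_run_before.isoformat() to disk. A minimal round-trip sketch (the path is hypothetical; parse_date and normalize_space are the helpers used throughout these examples):

from datetime import timezone
from pathlib import Path

tag_file = Path("/tmp/.dont_run_before")            # hypothetical tag file
tag_file.write_text("2021-06-01T12:00:00+00:00\n")  # as written by main() in Example 6

dont_run_before = date_from_file(tag_file, tzinfo=timezone.utc)
# expected: datetime(2021, 6, 1, 12, 0, tzinfo=timezone.utc), assuming parse_date
# understands ISO 8601 strings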
Example No. 16
def main():
    """Command line entry point."""

    settings = get_project_settings()
    configure_logging(settings)

    args = _parse_args()
    LOGGER.info(args)

    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ]

    try:
        execute(argv=command)
    finally:
        garbage_collect()
Example No. 17
    "PAGE_SIZE": 25,
    "DEFAULT_FILTER_BACKENDS": ("django_filters.rest_framework.DjangoFilterBackend",),
}

# REST proxy
REST_PROXY = {"HOST": "http://news.recommend.games"}

# Custom

RECOMMENDER_PATH = os.path.join(DATA_DIR, "recommender_bgg")
BGA_RECOMMENDER_PATH = os.path.join(DATA_DIR, "recommender_bga")
STAR_PERCENTILES = (0.165, 0.365, 0.615, 0.815, 0.915, 0.965, 0.985, 0.995)

PUBSUB_PUSH_ENABLED = True
PUBSUB_QUEUE_PROJECT = os.getenv("PUBSUB_QUEUE_PROJECT") or os.getenv("GC_PROJECT")
PUBSUB_QUEUE_TOPIC = os.getenv("PUBSUB_QUEUE_TOPIC")

MODEL_UPDATED_FILE = os.path.join(DATA_DIR, "updated_at")
PROJECT_VERSION_FILE = os.path.join(BASE_DIR, "VERSION")

MIN_VOTES_ANCHOR_DATE = "2020-08-01"
MIN_VOTES_SECONDS_PER_STEP = 10 * 24 * 60 * 60  # 10 days

R_G_RANKING_EFFECTIVE_DATE = (
    parse_date(
        os.getenv("R_G_RANKING_EFFECTIVE_DATE"),
        tzinfo=timezone.utc,
    )
    or parse_date("2022-02-22T00:00Z")
)
Example No. 18
class BggRankingsSpider(Spider):
    """BoardGameGeek rankings spider."""

    name = "bgg_rankings"
    allowed_domains = ("boardgamegeek.com", "archive.org")
    bgg_paths = (
        "browser.php?itemtype=game&sortby=rank",
        "rankbrowse.php3",
        "browse/boardgame",
        "top50.htm",
        "top50.php3",
        "topn.php3?count=50",
    )
    bgg_urls = (
        tuple(f"http://boardgamegeek.com/{path}" for path in bgg_paths)
        + tuple(f"https://boardgamegeek.com/{path}" for path in bgg_paths)
        + tuple(f"http://www.boardgamegeek.com/{path}" for path in bgg_paths)
        + tuple(f"https://www.boardgamegeek.com/{path}" for path in bgg_paths)
    )
    start_urls = (
        tuple(f"https://web.archive.org/web/{{date}}/{url}" for url in bgg_urls)
        + bgg_urls
    )
    item_classes = (GameItem,)

    custom_settings = {
        "DOWNLOAD_DELAY": 0.5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 8,
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
        "DELAYED_RETRY_ENABLED": True,
        "DELAYED_RETRY_HTTP_CODES": (202,),
        "DELAYED_RETRY_DELAY": 5.0,
        "AUTOTHROTTLE_HTTP_CODES": (429, 503, 504),
        "START_DATE": parse_date(os.getenv("START_DATE"), tzinfo=timezone.utc),
        "EARLIEST_DATE": parse_date(os.getenv("EARLIEST_DATE"), tzinfo=timezone.utc)
        or datetime(2000, 1, 1, tzinfo=timezone.utc),
        "LATEST_DATE": parse_date(os.getenv("LATEST_DATE"), tzinfo=timezone.utc),
    }

    def start_requests(self):
        """Generate start requests."""

        start_date = parse_date(self.settings.get("START_DATE"), tzinfo=timezone.utc)

        if not start_date:
            earliest_date = (
                parse_date(self.settings.get("EARLIEST_DATE"), tzinfo=timezone.utc)
                or now()
            )
            latest_date = (
                parse_date(self.settings.get("LATEST_DATE"), tzinfo=timezone.utc)
                or now()
            )
            # randint() needs integer bounds, so truncate the float timestamps
            start_date_ts = randint(
                int(earliest_date.timestamp()), int(latest_date.timestamp())
            )
            start_date = datetime.fromtimestamp(start_date_ts, tz=timezone.utc)

        self.logger.info("Start date: %s", start_date)

        start_date_str = start_date.strftime(WEB_ARCHIVE_DATE_FORMAT)

        for start_url in self.start_urls:
            yield Request(
                url=start_url.format(date=start_date_str),
                callback=self.parse,
                priority=1,
            )

    def parse(self, response):
        """
        @url https://boardgamegeek.com/browse/boardgame
        @returns items 100 100
        @returns requests 12 12
        """

        scraped_at = now()
        published_at = (
            _extract_date(response.url)
            or response.meta.get("published_at")
            or scraped_at
        )

        for next_page in response.xpath(
            "//a[contains(@title, 'page')]/@href"
        ).extract():
            yield response.follow(
                url=next_page,
                callback=self.parse,
                priority=1,
                meta={"published_at": published_at, "max_retry_times": 10},
            )

        for row in response.css("table#collectionitems tr"):
            link = row.css("td.collection_objectname a::attr(href)").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            year = _parse_int(
                element=row,
                css="td.collection_objectname span.smallerfont.dull",
                lenient=True,
            )
            image_url = row.css(
                "td.collection_thumbnail img::attr(src)"
            ).extract_first()
            image_url = [response.urljoin(image_url)] if image_url else None

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    year=year,
                    image_url=image_url,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_css("rank", "td.collection_rank")
            ldr.add_css("name", "td.collection_objectname a")

            values = row.css("td.collection_bggrating").extract()
            if len(values) == 3:
                ldr.add_value("bayes_rating", values[0])
                ldr.add_value("avg_rating", values[1])
                ldr.add_value("num_votes", values[2])

            yield ldr.load_item()

        for row in response.css("div.simplebox table tr"):
            cells = row.xpath("td")

            if len(cells) != 3:
                continue

            link = cells[1].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_xpath("rank", "td[1]")
            ldr.add_xpath("name", "td[2]")
            ldr.add_xpath("bayes_rating", "td[3]")

            yield ldr.load_item()

        for row in response.css("table.gamebrowser_table tr"):
            cells = row.xpath("td")

            if len(cells) < 5:
                continue

            link = cells[2].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            image_url = row.xpath("td[2]//img/@src").extract_first()
            image_url = [response.urljoin(image_url)] if image_url else None

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    image_url=image_url,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_xpath("rank", "td[1]")
            ldr.add_xpath("name", "td[3]")
            ldr.add_xpath("bayes_rating", "td[4]")
            ldr.add_xpath("num_votes", "td[5]")

            yield ldr.load_item()

        # Parse Top 50 page: top50.htm, top50.php3, topn.php3?count=50
        for row in response.xpath(
            "//table[tr/td[h3 and contains(., 'Bayesian Average')]]/tr"
        ):
            cells = row.xpath("td")

            if len(cells) < 4:
                continue

            link = cells[1].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)
            rank = _parse_int(cells[0], xpath="text()", lenient=True)

            if not bgg_id or not rank:
                continue

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    rank=rank,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_xpath("name", "td[2]")
            ldr.add_xpath("bayes_rating", "td[3]")
            ldr.add_xpath("num_votes", "td[4]")

            yield ldr.load_item()

        for anchor in response.xpath(
            "//div[@id = 'wm-ipp']//table//a[@title and @href]"
        ):
            if parse_date(anchor.xpath("@title").extract_first()):
                yield response.follow(
                    url=anchor.xpath("@href").extract_first(),
                    callback=self.parse,
                    priority=-1,
                    meta={"max_retry_times": 10},
                )
Example No. 19
    def parse(self, response):
        """
        @url https://boardgamegeek.com/browse/boardgame
        @returns items 100 100
        @returns requests 12 12
        """

        scraped_at = now()
        published_at = (
            _extract_date(response.url)
            or response.meta.get("published_at")
            or scraped_at
        )

        for next_page in response.xpath(
            "//a[contains(@title, 'page')]/@href"
        ).extract():
            yield response.follow(
                url=next_page,
                callback=self.parse,
                priority=1,
                meta={"published_at": published_at, "max_retry_times": 10},
            )

        for row in response.css("table#collectionitems tr"):
            link = row.css("td.collection_objectname a::attr(href)").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            year = _parse_int(
                element=row,
                css="td.collection_objectname span.smallerfont.dull",
                lenient=True,
            )
            image_url = row.css(
                "td.collection_thumbnail img::attr(src)"
            ).extract_first()
            image_url = [response.urljoin(image_url)] if image_url else None

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    year=year,
                    image_url=image_url,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_css("rank", "td.collection_rank")
            ldr.add_css("name", "td.collection_objectname a")

            values = row.css("td.collection_bggrating").extract()
            if len(values) == 3:
                ldr.add_value("bayes_rating", values[0])
                ldr.add_value("avg_rating", values[1])
                ldr.add_value("num_votes", values[2])

            yield ldr.load_item()

        for row in response.css("div.simplebox table tr"):
            cells = row.xpath("td")

            if len(cells) != 3:
                continue

            link = cells[1].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_xpath("rank", "td[1]")
            ldr.add_xpath("name", "td[2]")
            ldr.add_xpath("bayes_rating", "td[3]")

            yield ldr.load_item()

        for row in response.css("table.gamebrowser_table tr"):
            cells = row.xpath("td")

            if len(cells) < 5:
                continue

            link = cells[2].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            image_url = row.xpath("td[2]//img/@src").extract_first()
            image_url = [response.urljoin(image_url)] if image_url else None

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    image_url=image_url,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_xpath("rank", "td[1]")
            ldr.add_xpath("name", "td[3]")
            ldr.add_xpath("bayes_rating", "td[4]")
            ldr.add_xpath("num_votes", "td[5]")

            yield ldr.load_item()

        # Parse Top 50 page: top50.htm, top50.php3, topn.php3?count=50
        for row in response.xpath(
            "//table[tr/td[h3 and contains(., 'Bayesian Average')]]/tr"
        ):
            cells = row.xpath("td")

            if len(cells) < 4:
                continue

            link = cells[1].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)
            rank = _parse_int(cells[0], xpath="text()", lenient=True)

            if not bgg_id or not rank:
                continue

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    rank=rank,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )

            ldr.add_xpath("name", "td[2]")
            ldr.add_xpath("bayes_rating", "td[3]")
            ldr.add_xpath("num_votes", "td[4]")

            yield ldr.load_item()

        for anchor in response.xpath(
            "//div[@id = 'wm-ipp']//table//a[@title and @href]"
        ):
            if parse_date(anchor.xpath("@title").extract_first()):
                yield response.follow(
                    url=anchor.xpath("@href").extract_first(),
                    callback=self.parse,
                    priority=-1,
                    meta={"max_retry_times": 10},
                )
Example No. 20
def _extract_date(path_file, tzinfo=timezone.utc):
    file_name = os.path.basename(path_file)
    date_str, _ = os.path.splitext(file_name)
    return parse_date(date_str, tzinfo=tzinfo)
Example No. 21
def _following(date, week_day="SUN", tzinfo=timezone.utc):
    date = parse_date(date, tzinfo=tzinfo).astimezone(tzinfo)
    instruction = _make_instruction(week_day)
    return snap(date, instruction).date()
Example No. 22
def serialize_date(date, tzinfo=None):
    """seralize a date into ISO format if possible"""
    parsed = parse_date(date, tzinfo)
    return (
        parsed.strftime("%Y-%m-%dT%T%z") if parsed else str(date) if date else None
    )
Example No. 23
def _process_row(row):
    row["published_at"] = parse_date(row.get("published_at"))
    return row