def _find_latest_ranking(
    path_dir: Path,
    glob: str = "*.csv",
    star_percentiles: Optional[Iterable[float]] = None,  # was a bare typing object used as default
) -> tc.SFrame:
    path_dir = path_dir.resolve()
    LOGGER.info("Searching <%s> for latest ranking", path_dir)
    path_file = max(
        path_dir.glob(glob),
        key=lambda p: parse_date(p.stem, tzinfo=timezone.utc),
    )
    LOGGER.info("Loading ranking from <%s>", path_file)
    recommendations = tc.SFrame.read_csv(str(path_file))["rank", "bgg_id", "score"]
    if star_percentiles:
        buckets = tuple(percentile_buckets(recommendations["score"], star_percentiles))
        recommendations["stars"] = [
            star_rating(score=score, buckets=buckets, low=1.0, high=5.0)
            for score in recommendations["score"]
        ]
    return recommendations
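# A minimal usage sketch (assumed layout, not from the source): a directory
# of CSV rankings whose file stems are dates, e.g. rankings/2021-12-31.csv.
from pathlib import Path

latest = _find_latest_ranking(
    path_dir=Path("rankings"),
    star_percentiles=(0.165, 0.365, 0.615, 0.815, 0.915, 0.965, 0.985, 0.995),
)
print(latest.head())  # columns: rank, bgg_id, score, stars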
def serialize_date(date: Any, tzinfo: Optional[timezone] = None) -> Optional[str]:
    """Serialize a date into ISO format if possible."""
    parsed = parse_date(date, tzinfo)
    # %T is a glibc extension to strftime; spell out %H:%M:%S for portability
    return (
        parsed.strftime("%Y-%m-%dT%H:%M:%S%z")
        if parsed
        else str(date)
        if date
        else None
    )
def parse(self, response):
    """
    @url https://www.boardgamegeek.com/geeklist/30543/bgg-top-50-statistics-meta-list
    @returns items 0 0
    @returns requests 26
    """

    for next_page in response.xpath(
        "//a[contains(@title, 'page')]/@href"
    ).extract():
        yield response.follow(
            url=next_page,
            callback=self.parse,
        )

    scraped_at = now()

    for title in (
        response.xpath("/html/head/title/text()").extract()
        + response.css("div.geeklist_title::text").extract()
    ):
        match = TITLE_REGEX.match(title)
        published_at = (
            parse_date(match.group(2), tzinfo=timezone.utc) if match else None
        )
        if published_at:
            break
    else:
        published_at = None

    for item in response.xpath("//*[@data-objecttype = 'listitem']"):
        result = self.parse_item(
            item=item,
            response=response,
            published_at=published_at,
            scraped_at=scraped_at,
        )
        if result:
            yield result
def serialize_date(date: Any, tzinfo: Optional[timezone] = None) -> Optional[str]:
    """Serialize a date into ISO format if possible."""
    parsed = parse_date(date, tzinfo)
    return (
        parsed.isoformat(timespec="seconds")
        if parsed
        else str(date)
        if date
        else None
    )
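# A minimal usage sketch (assumed inputs, not from the source; assumes
# parse_date passes datetime instances through): unparseable values fall
# back to str(), and falsy values serialize to None.
from datetime import datetime, timezone

print(serialize_date(datetime(2021, 3, 17, 12, 0, tzinfo=timezone.utc)))
# -> "2021-03-17T12:00:00+00:00"
print(serialize_date("not a date"))  # -> "not a date" (fallback to str)
print(serialize_date(None))          # -> None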
def main():
    """Command line entry point."""

    args = _parse_args()

    logging.basicConfig(
        stream=sys.stderr,
        level=logging.DEBUG if args.verbose > 0 else logging.INFO,
        format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
    )

    LOGGER.info(args)

    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(args.dont_run_before, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    if args.interval and args.dont_run_before and not parse_date(args.dont_run_before):
        dont_run_before = now() + timedelta(seconds=args.interval)
        LOGGER.info(
            "Don't run next time before %s, writing tag to <%s>",
            dont_run_before.isoformat(),
            args.dont_run_before,
        )
        with open(args.dont_run_before, "w") as file_obj:
            file_obj.write(dont_run_before.isoformat())

    update_news(
        s3_src=f"s3://{args.src_bucket}/",
        path_feeds=args.feeds,
        path_merged=args.merged,
        path_split=args.split,
        s3_dst=f"s3://{args.dst_bucket}/",
        split_size=args.split_size,
        log_level="DEBUG" if args.verbose > 1 else "INFO" if args.verbose > 0 else "WARN",
    )
def rankings(self, request, pk=None):
    """Find historical rankings of a game."""
    filters = {
        "game": pk,
        "ranking_type__in": clear_list(_extract_params(request, "ranking_type")),
        "date__gte": parse_date(
            request.query_params.get("date__gte"), tzinfo=timezone.utc
        ),
        "date__lte": parse_date(
            request.query_params.get("date__lte"), tzinfo=timezone.utc
        ),
    }
    filters = {k: v for k, v in filters.items() if v}
    queryset = Ranking.objects.filter(**filters)
    serializer = RankingSerializer(
        queryset, many=True, context=self.get_serializer_context()
    )
    return Response(serializer.data)
def model_updated_at(file_path=settings.MODEL_UPDATED_FILE):
    """Latest model update."""
    try:
        with open(file_path) as file_obj:
            updated_at = file_obj.read()
        updated_at = normalize_space(updated_at)
        return parse_date(updated_at, tzinfo=timezone.utc)
    except Exception:
        pass
    return None
def history(self, request):
    """History of the top rankings."""

    top = parse_int(request.query_params.get("top")) or 100
    ranking_type = request.query_params.get("ranking_type") or Ranking.BGG

    filters = {
        "ranking_type": ranking_type,
        "date__gte": parse_date(
            request.query_params.get("date__gte"), tzinfo=timezone.utc
        ),
        "date__lte": parse_date(
            request.query_params.get("date__lte"), tzinfo=timezone.utc
        ),
    }
    filters = {k: v for k, v in filters.items() if v}
    queryset = Ranking.objects.filter(**filters)

    last_date = queryset.filter(rank=1).dates("date", "day", order="ASC").last()

    games = [
        r.game
        for r in queryset.filter(date=last_date, rank__lte=top)
        .order_by("rank")
        .select_related("game")
    ]
    assert len(games) == top

    game_ids = frozenset(g.bgg_id for g in games)
    rankings = queryset.filter(game__in=game_ids).order_by("date")

    data = [
        {
            "game": self.get_serializer(game).data,
            "rankings": RankingSerializer(
                rankings.filter(game=game.bgg_id),
                many=True,
                context=self.get_serializer_context(),
            ).data,
        }
        for game in games
    ]

    return Response(data)
def __init__(self, tag_file, date=None, seconds=None):
    date = parse_date(date, tzinfo=timezone.utc)
    seconds = parse_float(seconds)

    if not date and not seconds:
        raise NotConfigured

    self.tag_file = Path(tag_file).resolve()
    self.tag_file.parent.mkdir(parents=True, exist_ok=True)
    self.date = date
    self.seconds = seconds
def _rating_data(
    recommender_path=getattr(settings, "RECOMMENDER_PATH", None),
    pk_field="bgg_id",
    rankings_path=None,
    r_g_ranking_effective_date=getattr(settings, "R_G_RANKING_EFFECTIVE_DATE", None),
):
    recommender = load_recommender(recommender_path, "bgg")

    if not recommender:
        return {}

    r_g_ranking_effective_date = parse_date(
        r_g_ranking_effective_date,
        tzinfo=timezone.utc,
    )

    if (
        rankings_path
        and r_g_ranking_effective_date
        and now() >= r_g_ranking_effective_date
    ):
        LOGGER.info(
            "Using new R.G ranking effective from %s",
            r_g_ranking_effective_date,
        )
        recommendations = _find_latest_ranking(
            path_dir=Path(rankings_path),
            star_percentiles=getattr(settings, "STAR_PERCENTILES", None),
        )
    else:
        recommendations = recommender.recommend(
            star_percentiles=getattr(settings, "STAR_PERCENTILES", None),
        )

    LOGGER.info("Loaded recommendations for %d games", len(recommendations))

    count = -1
    result = {}

    for count, game in enumerate(recommendations):
        if count and count % 1000 == 0:
            LOGGER.info("processed %d items so far", count)
        pkey = game.get(pk_field)
        if pkey is None:
            continue
        result[pkey] = {
            "rec_rank": game.get("rank"),
            "rec_rating": game.get("score"),
            "rec_stars": game.get("stars"),
        }

    LOGGER.info("processed %d items in total", count)

    return result
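# Shape of the mapping returned by _rating_data(), keyed by pk_field
# (bgg_id by default); the numbers below are illustrative, not real data.
example_result = {
    13: {"rec_rank": 1, "rec_rating": 9.1, "rec_stars": 5.0},
    822: {"rec_rank": 2, "rec_rating": 8.9, "rec_stars": 4.5},
}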
def _parse_date(date, tzinfo=timezone.utc, format_str=WEB_ARCHIVE_DATE_FORMAT):
    try:
        date = datetime.strptime(date, format_str)
        return date.replace(tzinfo=tzinfo)
    except Exception:
        pass
    try:
        return parse_date(date, tzinfo, format_str)
    except Exception:
        pass
    return None
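# A usage sketch, assuming WEB_ARCHIVE_DATE_FORMAT is the 14-digit
# "%Y%m%d%H%M%S" timestamp format used in Wayback Machine URLs:
print(_parse_date("20200101000000"))  # -> 2020-01-01 00:00:00+00:00
print(_parse_date("not a date"))      # -> None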
def _local_requests(self, path_dir="."):
    path_dir = Path(path_dir).resolve()
    for path_file in path_dir.iterdir():
        if not path_file.is_file():
            continue
        self.logger.info("Processing <%s>", path_file)
        date = parse_date(path_file.stem, tzinfo=timezone.utc)
        yield Request(
            url=path_file.as_uri(),
            callback=self.parse,
            meta={"published_at": date},
        )
@classmethod
def from_crawler(cls, crawler):
    """Init from crawler."""
    tag_file = crawler.settings.get("DONT_RUN_BEFORE_FILE")
    date = parse_date(
        crawler.settings.get("DONT_RUN_BEFORE_DATE"), tzinfo=timezone.utc
    )
    seconds = crawler.settings.getfloat("DONT_RUN_BEFORE_SEC")

    if not tag_file or not (seconds or date):
        raise NotConfigured

    obj = cls(tag_file, date, seconds)
    crawler.signals.connect(obj._spider_opened, signals.spider_opened)

    return obj
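# A hedged configuration sketch (settings.py); the extension path is
# hypothetical, but the DONT_RUN_BEFORE_* settings match from_crawler above:
EXTENSIONS = {
    "board_game_scraper.dont_run_before.DontRunBefore": 100,  # hypothetical path
}
DONT_RUN_BEFORE_FILE = ".dont_run_before"
DONT_RUN_BEFORE_SEC = 6 * 60 * 60.0  # wait at least six hours between runs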
def date_from_file(
    path: Union[bytes, str, os.PathLike, None],
    tzinfo: Optional[timezone] = None,
    format_str: Optional[str] = None,
) -> Optional[datetime]:
    """Parse a date from a file."""

    if not path:
        return None

    path = Path(path).resolve()
    LOGGER.info("Reading date from path <%s>", path)

    try:
        with path.open() as file_obj:
            date = normalize_space(next(file_obj, None))
    except Exception:
        date = None

    return parse_date(date=date, tzinfo=tzinfo, format_str=format_str)
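# A usage sketch (hypothetical tag file written by the sleep logic above):
# the file's first line, e.g. "2021-03-17T12:00:00+00:00", is parsed;
# a missing or unparseable file yields None.
print(date_from_file(".dont_run_before", tzinfo=timezone.utc))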
def main():
    """Command line entry point."""

    settings = get_project_settings()
    configure_logging(settings)

    args = _parse_args()
    LOGGER.info(args)

    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ]

    try:
        execute(argv=command)
    finally:
        garbage_collect()
"PAGE_SIZE": 25, "DEFAULT_FILTER_BACKENDS": ("django_filters.rest_framework.DjangoFilterBackend",), } # REST proxy REST_PROXY = {"HOST": "http://news.recommend.games"} # Custom RECOMMENDER_PATH = os.path.join(DATA_DIR, "recommender_bgg") BGA_RECOMMENDER_PATH = os.path.join(DATA_DIR, "recommender_bga") STAR_PERCENTILES = (0.165, 0.365, 0.615, 0.815, 0.915, 0.965, 0.985, 0.995) PUBSUB_PUSH_ENABLED = True PUBSUB_QUEUE_PROJECT = os.getenv("PUBSUB_QUEUE_PROJECT") or os.getenv("GC_PROJECT") PUBSUB_QUEUE_TOPIC = os.getenv("PUBSUB_QUEUE_TOPIC") MODEL_UPDATED_FILE = os.path.join(DATA_DIR, "updated_at") PROJECT_VERSION_FILE = os.path.join(BASE_DIR, "VERSION") MIN_VOTES_ANCHOR_DATE = "2020-08-01" MIN_VOTES_SECONDS_PER_STEP = 10 * 24 * 60 * 60 # 10 days R_G_RANKING_EFFECTIVE_DATE = ( parse_date( os.getenv("R_G_RANKING_EFFECTIVE_DATE"), tzinfo=timezone.utc, ) or parse_date("2022-02-22T00:00Z") )
class BggRankingsSpider(Spider):
    """BoardGameGeek rankings spider."""

    name = "bgg_rankings"
    allowed_domains = ("boardgamegeek.com", "archive.org")
    bgg_paths = (
        "browser.php?itemtype=game&sortby=rank",
        "rankbrowse.php3",
        "browse/boardgame",
        "top50.htm",
        "top50.php3",
        "topn.php3?count=50",
    )
    bgg_urls = (
        tuple(f"http://boardgamegeek.com/{path}" for path in bgg_paths)
        + tuple(f"https://boardgamegeek.com/{path}" for path in bgg_paths)
        + tuple(f"http://www.boardgamegeek.com/{path}" for path in bgg_paths)
        + tuple(f"https://www.boardgamegeek.com/{path}" for path in bgg_paths)
    )
    start_urls = (
        tuple(f"https://web.archive.org/web/{{date}}/{url}" for url in bgg_urls)
        + bgg_urls
    )
    item_classes = (GameItem,)

    custom_settings = {
        "DOWNLOAD_DELAY": 0.5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 8,
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
        "DELAYED_RETRY_ENABLED": True,
        "DELAYED_RETRY_HTTP_CODES": (202,),
        "DELAYED_RETRY_DELAY": 5.0,
        "AUTOTHROTTLE_HTTP_CODES": (429, 503, 504),
        "START_DATE": parse_date(os.getenv("START_DATE"), tzinfo=timezone.utc),
        "EARLIEST_DATE": parse_date(os.getenv("EARLIEST_DATE"), tzinfo=timezone.utc)
        or datetime(2000, 1, 1, tzinfo=timezone.utc),
        "LATEST_DATE": parse_date(os.getenv("LATEST_DATE"), tzinfo=timezone.utc),
    }

    def start_requests(self):
        """Generate start requests."""

        start_date = parse_date(self.settings.get("START_DATE"), tzinfo=timezone.utc)

        if not start_date:
            earliest_date = (
                parse_date(self.settings.get("EARLIEST_DATE"), tzinfo=timezone.utc)
                or now()
            )
            latest_date = (
                parse_date(self.settings.get("LATEST_DATE"), tzinfo=timezone.utc)
                or now()
            )
            # randint requires ints, but timestamp() returns floats
            start_date_ts = randint(
                int(earliest_date.timestamp()), int(latest_date.timestamp())
            )
            start_date = datetime.fromtimestamp(start_date_ts, tz=timezone.utc)

        self.logger.info("Start date: %s", start_date)
        start_date_str = start_date.strftime(WEB_ARCHIVE_DATE_FORMAT)

        for start_url in self.start_urls:
            yield Request(
                url=start_url.format(date=start_date_str),
                callback=self.parse,
                priority=1,
            )

    def parse(self, response):
        """
        @url https://boardgamegeek.com/browse/boardgame
        @returns items 100 100
        @returns requests 12 12
        """

        scraped_at = now()
        published_at = (
            _extract_date(response.url)
            or response.meta.get("published_at")
            or scraped_at
        )

        for next_page in response.xpath(
            "//a[contains(@title, 'page')]/@href"
        ).extract():
            yield response.follow(
                url=next_page,
                callback=self.parse,
                priority=1,
                meta={"published_at": published_at, "max_retry_times": 10},
            )

        for row in response.css("table#collectionitems tr"):
            link = row.css("td.collection_objectname a::attr(href)").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            year = _parse_int(
                element=row,
                css="td.collection_objectname span.smallerfont.dull",
                lenient=True,
            )
            image_url = row.css(
                "td.collection_thumbnail img::attr(src)"
            ).extract_first()
            image_url = [response.urljoin(image_url)] if image_url else None

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    year=year,
                    image_url=image_url,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )
            ldr.add_css("rank", "td.collection_rank")
            ldr.add_css("name", "td.collection_objectname a")

            values = row.css("td.collection_bggrating").extract()
            if len(values) == 3:
                ldr.add_value("bayes_rating", values[0])
                ldr.add_value("avg_rating", values[1])
                ldr.add_value("num_votes", values[2])

            yield ldr.load_item()

        for row in response.css("div.simplebox table tr"):
            cells = row.xpath("td")
            if len(cells) != 3:
                continue

            link = cells[1].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )
            ldr.add_xpath("rank", "td[1]")
            ldr.add_xpath("name", "td[2]")
            ldr.add_xpath("bayes_rating", "td[3]")

            yield ldr.load_item()

        for row in response.css("table.gamebrowser_table tr"):
            cells = row.xpath("td")
            if len(cells) < 5:
                continue

            link = cells[2].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)

            if not bgg_id:
                continue

            image_url = row.xpath("td[2]//img/@src").extract_first()
            image_url = [response.urljoin(image_url)] if image_url else None

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    image_url=image_url,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )
            ldr.add_xpath("rank", "td[1]")
            ldr.add_xpath("name", "td[3]")
            ldr.add_xpath("bayes_rating", "td[4]")
            ldr.add_xpath("num_votes", "td[5]")

            yield ldr.load_item()

        # Parse Top 50 page: top50.htm, top50.php3, topn.php3?count=50
        for row in response.xpath(
            "//table[tr/td[h3 and contains(., 'Bayesian Average')]]/tr"
        ):
            cells = row.xpath("td")
            if len(cells) < 4:
                continue

            link = cells[1].xpath("a/@href").extract_first()
            link = response.urljoin(link)
            bgg_id = _extract_bgg_id(link)
            rank = _parse_int(cells[0], xpath="text()", lenient=True)

            if not bgg_id or not rank:
                continue

            ldr = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    rank=rank,
                    published_at=published_at,
                    scraped_at=scraped_at,
                ),
                selector=row,
                response=response,
            )
            ldr.add_xpath("name", "td[2]")
            ldr.add_xpath("bayes_rating", "td[3]")
            ldr.add_xpath("num_votes", "td[4]")

            yield ldr.load_item()

        # Follow dated Wayback Machine toolbar links to other snapshots
        for anchor in response.xpath(
            "//div[@id = 'wm-ipp']//table//a[@title and @href]"
        ):
            if parse_date(anchor.xpath("@title").extract_first()):
                yield response.follow(
                    url=anchor.xpath("@href").extract_first(),
                    callback=self.parse,
                    priority=-1,
                    meta={"max_retry_times": 10},
                )
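# A hedged sketch of running the spider programmatically; the project's usual
# entry point is "scrapy crawl bgg_rankings" (see the job-runner main() above).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(BggRankingsSpider)
process.start()  # blocks until the crawl finishes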
def _extract_date(path_file, tzinfo=timezone.utc):
    file_name = os.path.basename(path_file)
    date_str, _ = os.path.splitext(file_name)
    return parse_date(date_str, tzinfo=tzinfo)
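# A usage sketch (hypothetical feed path): the date is parsed from the file
# stem, so a feed dumped as 2021-03-17.jl carries its dump date.
print(_extract_date("feeds/2021-03-17.jl"))  # -> 2021-03-17 00:00:00+00:00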
def _following(date, week_day="SUN", tzinfo=timezone.utc):
    date = parse_date(date, tzinfo=tzinfo).astimezone(tzinfo)
    instruction = _make_instruction(week_day)
    return snap(date, instruction).date()
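# A usage sketch, assuming _make_instruction builds a snaptime-style
# instruction that snaps forward to the given week day:
print(_following("2021-03-17", week_day="SUN"))  # Wednesday -> 2021-03-21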
def serialize_date(date, tzinfo=None):
    """Serialize a date into ISO format if possible."""
    parsed = parse_date(date, tzinfo)
    # %T is a glibc extension to strftime; spell out %H:%M:%S for portability
    return (
        parsed.strftime("%Y-%m-%dT%H:%M:%S%z")
        if parsed
        else str(date)
        if date
        else None
    )
def _process_row(row):
    row["published_at"] = parse_date(row.get("published_at"))
    return row
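# A usage sketch (illustrative row, not real data): the row is mutated in
# place and returned, with unparseable dates becoming None.
row = _process_row({"title": "News item", "published_at": "2021-03-17T12:00Z"})
print(row["published_at"])  # -> 2021-03-17 12:00:00+00:00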