Example #1
def _parse_game(game):
    for field in DEDUPE_FIELDS:
        game.setdefault(field["field"], None)
        if field["type"] == "Set":
            game[field["field"]] = tuple(arg_to_iter(
                game[field["field"]])) or None
    game["names"] = tuple(
        clear_list(
            chain(arg_to_iter(game.get("name")),
                  arg_to_iter(game.get("alt_name")))))
    for field in VALUE_ID_FIELDS:
        game[field] = tuple(
            clear_list(map(_parse_value_id, arg_to_iter(game.get(field)))))
    return game
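
Every example on this page leans on the same two helpers, clear_list and arg_to_iter. For reference, here is a minimal sketch of their presumed semantics; the stand-in implementations are assumptions for illustration, not the library's actual code:

def arg_to_iter(arg):
    # Assumed behavior: None becomes an empty iterable, strings and other
    # scalars become a single-element tuple, any other iterable passes through.
    if arg is None:
        return ()
    if isinstance(arg, (str, bytes, dict)) or not hasattr(arg, "__iter__"):
        return (arg,)
    return arg


def clear_list(items):
    # Assumed behavior: drop falsy entries and duplicates, keep first-seen order.
    seen = set()
    result = []
    for item in items:
        if item and item not in seen:
            seen.add(item)
            result.append(item)
    return result


# e.g. tuple(clear_list(["Catan", None, "Catan", ""])) or None  ->  ("Catan",)
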
Example #2
    def parse(self, response):
        """
        @url https://boardgamegeek.com/browse/boardgame/
        @returns items 0 0
        @returns requests 11
        """

        next_page = response.xpath(
            '//a[@title = "next page"]/@href').extract_first()
        if next_page:
            yield Request(
                response.urljoin(next_page),
                callback=self.parse,
                priority=1,
                meta={"max_retry_times": 10},
            )

        urls = response.xpath("//@href").extract()
        bgg_ids = filter(None, map(extract_bgg_id, map(response.urljoin,
                                                       urls)))
        yield from self._game_requests(*bgg_ids)

        user_names = filter(None, map(extract_bgg_user_name, urls))
        scraped_at = now()

        for user_name in clear_list(user_names):
            if self.scrape_collections:
                yield self.collection_request(user_name)
            else:
                yield self._user_item_or_request(
                    user_name, scraped_at=scraped_at)
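
The ID filtering in parse() hinges on extract_bgg_id, which is not shown on this page. A hedged stand-in (not the project's implementation) to illustrate the idea: pull the numeric ID out of a BGG game URL and return None for anything else, so filter(None, ...) discards non-game links:

import re
from urllib.parse import urlparse


def extract_bgg_id(url):
    # Illustrative sketch only: accepts a URL string or an already-parsed
    # result and returns the numeric ID, e.g. ".../boardgame/13/catan" -> 13.
    if not url:
        return None
    parsed = urlparse(url) if isinstance(url, str) else url
    match = re.match(r"/boardgame(?:expansion|accessory)?/(\d+)", parsed.path)
    return int(match.group(1)) if match else None
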
Example #3
    def process_item(self, item, spider):
        """ resolve resource image URLs to actual file locations """
        for field in self.fields:
            if item.get(field):
                item[field] = clear_list(
                    map(self._parse_url, arg_to_iter(item[field])))
        return item
Example #4
    def dates(self, request):
        """Find all available dates with rankings."""

        query_set = self.get_queryset().order_by("ranking_type", "date")

        ranking_types = clear_list(_extract_params(request, "ranking_type"))
        if ranking_types:
            query_set = query_set.filter(ranking_type__in=ranking_types)

        return Response(query_set.values("ranking_type", "date").distinct())
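
_extract_params is not shown on this page; a hypothetical sketch of what it likely does, namely gather every value supplied for a query parameter so that clear_list can de-duplicate them:

def _extract_params(request, *keys):
    # Hypothetical helper: yield all values passed for the given query
    # parameters, e.g. ?ranking_type=a&ranking_type=b.
    for key in keys:
        yield from request.query_params.getlist(key)
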
Example #5
def extract_ids(*urls: Optional[str]) -> Dict[str, List[Union[int, str]]]:
    """ extract all possible IDs from all the URLs """
    urls = tuple(map(urlparse, urls))
    return {
        "bgg_id": clear_list(map(extract_bgg_id, urls)),
        "freebase_id": clear_list(map(extract_freebase_id, urls)),
        "wikidata_id": clear_list(map(extract_wikidata_id, urls)),
        "wikipedia_id": clear_list(map(extract_wikipedia_id, urls)),
        "dbpedia_id": clear_list(map(extract_dbpedia_id, urls)),
        "luding_id": clear_list(map(extract_luding_id, urls)),
        "spielen_id": clear_list(map(extract_spielen_id, urls)),
        "bga_id": clear_list(map(extract_bga_id, urls)),
    }
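
For orientation, the return value always carries the same eight keys; the concrete IDs depend entirely on the individual extract_* helpers, so the values below are placeholders:

ids = extract_ids("https://boardgamegeek.com/boardgame/13/catan", None)
# presumably something like:
# {"bgg_id": [13], "freebase_id": [], "wikidata_id": [], "wikipedia_id": [],
#  "dbpedia_id": [], "luding_id": [], "spielen_id": [], "bga_id": []}
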
Example #6
def _process_df(data_frame,
                columns=None,
                required_columns=None,
                target_column=None):
    if data_frame is None or data_frame.empty:
        LOGGER.error("DataFrame is empty")
        return None

    columns = clear_list(arg_to_iter(columns))
    required_columns = clear_list(arg_to_iter(required_columns)) or columns
    columns = clear_list(columns + required_columns)

    if not columns:
        LOGGER.error("No columns given")
        return None

    missing_columns = [
        column for column in required_columns if column not in data_frame
    ]
    if missing_columns:
        LOGGER.error("DataFrame does not contain the expected columns %s",
                     missing_columns)
        return None

    for column in columns:
        if column not in data_frame:
            data_frame[column] = None

    target_column = target_column or columns[0]
    return (
        data_frame[columns][data_frame[target_column].notna()]
        .sort_values(target_column)
        .rename(columns={"bayes_rating": "score"})
        .astype({"rank": int, "bgg_id": int})
    )
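
The three clear_list calls at the top implement the column fallbacks; worked through with toy arguments (assuming clear_list de-duplicates while keeping order):

columns = clear_list(arg_to_iter("rank"))                     # ["rank"]
required_columns = clear_list(arg_to_iter(None)) or columns   # falls back to ["rank"]
columns = clear_list(columns + required_columns)              # still ["rank"], duplicates dropped
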
Example #7
    def _extract_labels(self, response, value):
        json_obj = parse_json(response.text) if hasattr(response, "text") else {}

        labels = take_first(jmespath.search(f"entities.{value}.labels", json_obj)) or {}
        labels = labels.values()
        labels = sorted(
            labels,
            key=lambda label: self.lang_priorities.get(label.get("language"), math.inf),
        )
        labels = clear_list(label.get("value") for label in labels)

        self.labels[value] = labels
        self.logger.debug("resolved labels for %s: %s", value, labels)

        return labels
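
The sort key deserves a closer look: languages present in self.lang_priorities keep their configured rank, everything else falls to the end via math.inf. A small, self-contained illustration with made-up labels and priorities:

import math

lang_priorities = {"en": 0, "de": 1}
labels = [
    {"language": "fr", "value": "Les Colons de Catane"},
    {"language": "en", "value": "Catan"},
    {"language": "de", "value": "Die Siedler von Catan"},
]
labels = sorted(
    labels,
    key=lambda label: lang_priorities.get(label.get("language"), math.inf),
)
# -> English first, then German, then the unprioritized French label
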
Example #8
    def _game_requests(self,
                       *bgg_ids,
                       batch_size=10,
                       page=1,
                       priority=0,
                       **kwargs):
        bgg_ids = clear_list(map(parse_int, bgg_ids))

        if not bgg_ids:
            return

        if page == 1:
            bgg_ids = (bgg_id for bgg_id in bgg_ids
                       if bgg_id not in self._ids_seen)

        for batch in batchify(bgg_ids, batch_size):
            batch = tuple(batch)

            ids = ",".join(map(str, batch))

            if page == 1:
                url = self._api_url(
                    action="thing",
                    id=ids,
                    stats=1,
                    videos=1,
                    versions=int(self.scrape_ratings),
                    ratingcomments=int(self.scrape_ratings),
                    page=1,
                )
            else:
                url = self._api_url(
                    action="thing",
                    id=ids,
                    versions=1,
                    ratingcomments=1,
                    page=page,
                )

            request = Request(url, callback=self.parse_game, priority=priority)

            if len(batch) == 1:
                request.meta["bgg_id"] = batch[0]
            request.meta["page"] = page
            request.meta.update(kwargs)

            yield request

            if page == 1:
                self._ids_seen.update(batch)
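
batchify is another helper that is not shown here; a minimal assumed stand-in that chunks any iterable into tuples of at most batch_size items, which is all _game_requests relies on:

from itertools import islice


def batchify(iterable, size):
    # Assumed behavior: yield successive chunks of at most <size> items.
    iterator = iter(iterable)
    while True:
        batch = tuple(islice(iterator, size))
        if not batch:
            return
        yield batch
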
Example #9
    def rankings(self, request, pk=None):
        """Find historical rankings of a game."""

        filters = {
            "game": pk,
            "ranking_type__in": clear_list(_extract_params(request, "ranking_type")),
            "date__gte": parse_date(
                request.query_params.get("date__gte"), tzinfo=timezone.utc
            ),
            "date__lte": parse_date(
                request.query_params.get("date__lte"), tzinfo=timezone.utc
            ),
        }
        filters = {k: v for k, v in filters.items() if v}
        queryset = Ranking.objects.filter(**filters)
        serializer = RankingSerializer(
            queryset, many=True, context=self.get_serializer_context()
        )
        return Response(serializer.data)
Example #10
    def _add_value(self, result, field, item):
        labels = clear_list(flatten(r[1] for r in arg_to_iter(result))) or None
        self.logger.debug("resolved labels for %s: %s", item.get(field), labels)
        item[field] = labels
        return item
Example #11
def merge_files(
    in_paths,
    out_path,
    keys="id",
    key_types=None,
    latest=None,
    latest_types=None,
    latest_min=None,
    latest_required=False,
    fieldnames=None,
    fieldnames_exclude=None,
    sort_keys=False,
    sort_latest=False,
    sort_fields=None,
    sort_descending=False,
    concat_output=False,
    log_level=None,
):
    """ merge files into one """

    spark = _spark_session(log_level=log_level)

    if spark is None:
        raise RuntimeError(
            "Please make sure Spark is installed and configured correctly!")

    in_paths = list(map(str, arg_to_iter(in_paths)))

    LOGGER.info(
        "Merging items from %s into <%s> with Spark session %r",
        f"[{len(in_paths) } paths]" if len(in_paths) > 10 else in_paths,
        out_path,
        spark,
    )

    fieldnames = clear_list(arg_to_iter(fieldnames))
    fieldnames_exclude = frozenset(arg_to_iter(fieldnames_exclude))

    if fieldnames and fieldnames_exclude:
        LOGGER.warning(
            "Both <fieldnames> and <fieldnames_exclude> were specified, please choose one"
        )

    sort_fields = tuple(arg_to_iter(sort_fields))
    if sum(map(bool, (sort_keys, sort_latest, sort_fields))) > 1:
        LOGGER.warning(
            "Only use at most one of <sort_keys>, <sort_latest>, and <sort_fields>"
        )

    keys = tuple(arg_to_iter(keys))
    key_types = tuple(arg_to_iter(key_types))
    key_types += (None, ) * (len(keys) - len(key_types))
    assert len(keys) == len(key_types)
    LOGGER.info("Using keys %s with types %s", keys, key_types)

    latest = tuple(arg_to_iter(latest))
    latest_types = tuple(arg_to_iter(latest_types))
    latest_types += (None, ) * (len(latest) - len(latest_types))
    assert len(latest) == len(latest_types)
    LOGGER.info("Using latest %s with types %s", latest, latest_types)

    data = spark.read.json(path=in_paths,
                           mode="DROPMALFORMED",
                           dropFieldIfAllNull=True)

    key_column_names = [f"_key_{i}" for i in range(len(keys))]
    key_columns = [
        _column_type(data[column], column_type).alias(name)
        for column, column_type, name in zip(keys, key_types, key_column_names)
    ]
    key_columns_str = (column.cast("string") for column in key_columns)
    latest_column_names = [f"_latest_{i}" for i in range(len(latest))]
    latest_columns = [
        _column_type(data[column], column_type).alias(name)
        for column, column_type, name in zip(
            latest, latest_types, latest_column_names)
    ]
    latest_columns_str = (column.cast("string") for column in latest_columns)

    drop_subset = keys + tuple(key_column_names)
    if latest_required:
        drop_subset += latest + tuple(latest_column_names)
    LOGGER.info("Dropping rows without values in columns %s", drop_subset)

    data = data.select(
        "*",
        *key_columns,
        array(*key_columns_str).alias("_key"),
        *latest_columns,
        array(*latest_columns_str).alias("_latest"),
    ).dropna(how="any", subset=drop_subset)

    if latest_min is not None:
        LOGGER.info("Filter out items before %s", latest_min)
        data = data.filter(latest_columns[0] >= latest_min)

    rdd = (
        data.rdd.keyBy(lambda row: tuple(arg_to_iter(row["_key"])))
        .reduceByKey(_compare)
        .values()
    )

    data = rdd.toDF(schema=data.schema)

    if sort_keys:
        LOGGER.info(
            "Sorting %s by columns %s",
            "descending" if sort_descending else "ascending",
            keys,
        )
        data = data.sort(*key_column_names, ascending=not sort_descending)
    elif sort_latest:
        LOGGER.info(
            "Sorting %s by columns %s",
            "descending" if sort_descending else "ascending",
            latest,
        )
        data = data.sort(*latest_column_names, ascending=not sort_descending)
    elif sort_fields:
        LOGGER.info(
            "Sorting %s by columns %s",
            "descending" if sort_descending else "ascending",
            sort_fields,
        )
        data = data.sort(*sort_fields, ascending=not sort_descending)

    data = data.drop("_key", *key_column_names, "_latest",
                     *latest_column_names)

    columns = frozenset(data.columns) - fieldnames_exclude
    if fieldnames:
        fieldnames = [column for column in fieldnames if column in columns]
        LOGGER.info("Only use columns: %s", fieldnames)
    else:
        fieldnames = sorted(columns)
        LOGGER.info("Use sorted column names: %s", fieldnames)
    data = data.select(*fieldnames)

    data = _remove_empty(data)

    if concat_output:
        with tempfile.TemporaryDirectory() as temp_path:
            path = Path(temp_path) / "out"
            LOGGER.info("Saving temporary output to <%s>", path)
            data.write.json(path=str(path))

            LOGGER.info("Concatenate temporary files to <%s>", out_path)
            files = path.glob("part-*")
            concat_files(dst=out_path, srcs=sorted(files), ensure_newline=True)

    else:
        LOGGER.info("Saving output to <%s>", out_path)
        data.write.json(path=str(out_path))

    LOGGER.info("Done merging.")