Example No. 1
def export_key_value_records(
    key_value_records: dbag.Bag,
    export_dir: Path,
) -> None:
    """Converts a Dask bag of dicts into a collection of JSON files.

    In order to create a lookup table, we must first export all data as JSON.
    This function maps each element of the input bag to a JSON-encoded string
    and writes one file per partition to export_dir. WARNING: this function
    will delete any JSON files already present in export_dir.

    Args:
      key_value_records: A dask bag containing dicts.
      export_dir: The location to write JSON files. Any JSON files already
        present in this directory are deleted first.

    """
    export_dir = Path(export_dir)
    # Clean up / setup export dir
    export_dir.mkdir(parents=True, exist_ok=True)
    # Remove any previously constructed json files in there
    for json_file in export_dir.glob("*.json"):
        json_file.unlink()
    (key_value_records.map(_record_to_kv_json).to_textfiles(
        f"{export_dir}/*.json"))
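For reference, here is a minimal, self-contained sketch of the same bag-to-JSON-files pattern, with json.dumps standing in for the private _record_to_kv_json helper; the path and record contents are illustrative only.

import json
from pathlib import Path

import dask.bag as dbag

records = dbag.from_sequence(
    [{"key": "a", "value": 1}, {"key": "b", "value": 2}],
    npartitions=2,
)
export_dir = Path("/tmp/kv_export")  # hypothetical export location
export_dir.mkdir(parents=True, exist_ok=True)
# One output file per partition: 0.json, 1.json, ...
records.map(json.dumps).to_textfiles(f"{export_dir}/*.json")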
Example No. 2
def fetch_page_ids(
    bucket_name: str = S3_CANONICAL_DATA_BUCKET,
    source: str = "issues",
    issue_bag: db.Bag = None,
    n_partitions: int = 100,
) -> db.Bag:

    valid_sources = ["issues", "pages"]
    assert source in valid_sources, f"source must be one of {valid_sources}"

    if issue_bag is None:
        issue_bag = fetch_issues(bucket_name,
                                 compute=False).filter(lambda i: len(i) > 0)

    if source == "issues":
        print(f"Fetching page IDs from {source}")
        # issue_bag is guaranteed to be set at this point; no need to recompute the issues
        return issue_bag.map(lambda i: i["pp"]).flatten()
    else:
        page_files = list_pages(bucket_name)
        return (db.from_sequence(page_files, npartitions=n_partitions).map(
            alternative_read_text, IMPRESSO_STORAGEOPT).flatten().map(
                json.loads).filter(lambda i: len(i) > 0).pluck("id"))
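A self-contained sketch of the pipeline shape used in the "pages" branch, with in-memory JSON strings standing in for the files read from S3; the bucket helpers and the IDs below are illustrative only.

import json

import dask.bag as db

raw_pages = [
    '{"id": "GDL-1900-01-01-a-p0001"}',
    '{"id": "GDL-1900-01-01-a-p0002"}',
    '{}',  # empty documents get filtered out
]
page_ids = (db.from_sequence(raw_pages, npartitions=2)
            .map(json.loads)
            .filter(lambda i: len(i) > 0)
            .pluck("id"))
print(page_ids.compute())  # ['GDL-1900-01-01-a-p0001', 'GDL-1900-01-01-a-p0002']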
Example No. 3
def create_lookup_table(
    record_bag: dbag.Bag,
    key_field: str,
    value_field: str,
    database_path: Path,
    intermediate_data_dir: Path,
    agatha_install_path: Path,
) -> None:
    database_path = Path(database_path)
    intermediate_data_dir = Path(intermediate_data_dir)
    agatha_install_path = Path(agatha_install_path)
    if not database_path.is_file():
        if not intermediate_data_dir.exists():
            intermediate_data_dir.mkdir(parents=True, exist_ok=True)
        else:
            # Remove any previously constructed json files in there
            print("\t- Removing existing json files from",
                  intermediate_data_dir)
            for json_file in intermediate_data_dir.glob("*.json"):
                json_file.unlink()

        print("\t- Writing intermediate json files")
        (  # Save all keys and values as kv pair json files
            record_bag.map(_record_to_kv_json,
                           key_field=key_field,
                           value_field=value_field).to_textfiles(
                               f"{intermediate_data_dir}/*.json"))
        print("\t- Writing", database_path)
        _make_sqlite3_database_from_json(
            intermediate_data_dir=intermediate_data_dir,
            database_path=database_path,
            agatha_install_path=agatha_install_path)
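A hypothetical usage sketch: the field names, paths, and the agatha_install_path value below are illustrative only, and the helpers called inside create_lookup_table (such as the JSON-to-sqlite3 conversion step) come from the surrounding project and are not shown here.

from pathlib import Path

import dask.bag as dbag

records = dbag.from_sequence([
    {"doc_id": "d1", "payload": {"title": "first"}},
    {"doc_id": "d2", "payload": {"title": "second"}},
])
create_lookup_table(
    record_bag=records,
    key_field="doc_id",
    value_field="payload",
    database_path=Path("/tmp/lookup.sqlite3"),
    intermediate_data_dir=Path("/tmp/lookup_json"),
    agatha_install_path=Path("/path/to/agatha"),  # assumed project checkout
)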
Example No. 4
def check_duplicated_content_item_IDs(issue_bag: bag.Bag) -> pd.DataFrame:
    """Check for duplicated content item IDs across the whole corpus.

    .. note::
        This is a global check.

    :param bag.Bag issue_bag: Bag of issue documents (JSON dicts) to inspect.
    :return: One row per duplicated content item ID, with its frequency and the
        newspaper it belongs to.
    :rtype: pd.DataFrame

    """
    duplicates = (issue_bag.map(
        lambda issue_json: [ci["m"]["id"] for ci in issue_json["i"]]).flatten(
        ).frequencies().filter(lambda i: i[1] > 1).map(
            lambda i: {
                "ci_id": i[0],
                "freq": i[1],
                "newspaper_id": i[0].split("-")[0]
            }).compute())

    if duplicates:
        duplicates_df = pd.DataFrame(duplicates).set_index("ci_id")
    else:
        # there are no duplicates; keep the same shape (ci_id as index)
        duplicates_df = pd.DataFrame(
            columns=["ci_id", "freq", "newspaper_id"]).set_index("ci_id")

    print((f"Found {duplicates_df.shape[0]} duplicated "
           "content item IDs, belonging to "
           f"{duplicates_df.newspaper_id.unique().size} journals "
           f"({', '.join(list(duplicates_df.newspaper_id.unique()))})"))
    return duplicates_df
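A self-contained sketch of the frequencies()/filter() idiom used above, with a flat bag of content item IDs standing in for the nested issue documents; the IDs are made up.

import dask.bag as bag
import pandas as pd

ci_ids = bag.from_sequence([
    "GDL-1900-01-01-a-i0001",
    "GDL-1900-01-01-a-i0001",  # duplicated on purpose
    "JDG-1900-01-01-a-i0002",
])
duplicates = (ci_ids.frequencies()            # (id, count) pairs
              .filter(lambda i: i[1] > 1)     # keep ids seen more than once
              .map(lambda i: {"ci_id": i[0], "freq": i[1],
                              "newspaper_id": i[0].split("-")[0]})
              .compute())
print(pd.DataFrame(duplicates).set_index("ci_id"))  # one row for the GDL id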
Example No. 5
def _run_apply(algorithms: Iterable[Algorithm], dataset: Bag) -> Tuple:
    algorithms = list(algorithms)

    def fold(*args):
        # element-wise combination of two tuples of per-algorithm partial results
        return tuple(left + right for left, right in zip(*args))

    reduced = (dataset.map(lambda data: tuple(
        alg.apply(data) for alg in algorithms)).fold(fold).compute())
    assert len(reduced) == len(algorithms)
    return tuple(reduced)
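A toy sketch of the fold() reduction above: each element is mapped to a tuple of per-algorithm partial results, and fold() adds the tuples element-wise. The two stand-in "algorithms" below (count and sum) replace Algorithm.apply and are assumptions for illustration.

import dask.bag as db

dataset = db.from_sequence([1, 2, 3, 4], npartitions=2)

def fold(*args):
    return tuple(left + right for left, right in zip(*args))

# Map each element to (count contribution, sum contribution), then reduce.
reduced = dataset.map(lambda x: (1, x)).fold(fold).compute()
print(reduced)  # (4, 10)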
Example No. 6
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``, which cannot easily be used
    to store datasets in parallel (e.g. from a dict).

    ``delete_scopes`` is a dictionary mapping each kartothek dataset id to the ``delete_scope`` of that dataset
    (see `update_dataset_from_partitions` for the definition of the single-dataset ``delete_scope``).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
                store=store)

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and `MetaPartition.build_indices`, so this is not
    # required here anymore

    mps = mps.map(_multiplex_store,
                  store=store,
                  cube=cube,
                  df_serializer=df_serializer)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False,
                         out_type=db.Bag)
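A generic, self-contained sketch of the reduction() pattern used above: each partition is collected into a list, and a single aggregate step then runs over the list of per-partition lists. The aggregate below simply counts elements and is an assumption for illustration.

import dask.bag as db

items = db.from_sequence(range(10), npartitions=3)

def aggregate(partitions):
    # `partitions` is a list containing one list per input partition
    return [sum(len(p) for p in partitions)]

result = items.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False,
                         out_type=db.Bag)
print(result.compute())  # [10]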
Example No. 7
def record_to_bipartite_edges(
    records: dbag.Bag,
    get_neighbor_keys_fn: Callable[[Record], Iterable[str]],
    get_source_key_fn: Callable[[Record], str] = lambda x: x["id"],
    bidirectional: bool = True,
) -> dbag.Bag:
    """
    This function is responsible for extracting edges from records. For example,
    if you had a bag of records, each containing a set of terms, you might want
    to get the set of edges between records and terms.

    Args:
      records: The collection of records we wish to extract edges from.
      get_neighbor_keys_fn: Given a record, return a list of graph keys that
        are adjacent to the given record.
      get_source_key_fn: Given a record, return a graph key that uniquely
        identifies the record. By default we use the "id" field.
      bidirectional: If True, we write record->neighbor and neighbor->record.
        If False, we only write record->neighbor.

    Returns:
      A bag containing serialized key-value pairs that can be used to create a
      Sqlite3LookupTable.

    """
    def _to_kv(recs: Iterable[Record]) -> List[str]:
        """Convert {id, neighs} records into key/value JSON strings."""
        # Create graph, remove duplicate edges
        graph = defaultdict(set)
        for r in recs:
            id_ = r["id"]
            for neigh in r["neighs"]:
                graph[id_].add(neigh)
                if bidirectional:
                    graph[neigh].add(id_)
        # Output edges
        res = []
        for source, targets in graph.items():
            for target in targets:
                res.append(json.dumps(dict(key=source, value=target)))
        return res

    return (records.map(lambda r: {
        "id": get_source_key_fn(r),
        "neighs": get_neighbor_keys_fn(r)
    }).map_partitions(_to_kv))
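A hypothetical usage sketch: the field name "terms" and the key prefixes are illustrative, and the function above (together with its json/defaultdict imports) is assumed to be available.

import dask.bag as dbag

docs = dbag.from_sequence([
    {"id": "s:doc1", "terms": ["n:apple", "n:pie"]},
    {"id": "s:doc2", "terms": ["n:apple"]},
])
edges = record_to_bipartite_edges(
    records=docs,
    get_neighbor_keys_fn=lambda r: r["terms"],
)
# Yields JSON strings such as {"key": "s:doc1", "value": "n:apple"} and, because
# bidirectional=True by default, also {"key": "n:apple", "value": "s:doc1"}.
print(edges.compute())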
Example No. 8
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes.
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written; must be specified in advance. If not provided, it is assumed that only the
        seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        Whether existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional DataFrame to Parquet serializer.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that builds the cube and returns the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids is None:
        ktk_cube_dataset_ids = [cube.seed_dataset]
    else:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=set(),
            partition_on=prep_partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
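An illustrative sketch of the assumed input shape only (see :func:`~kartothek.io.eager_cube.build_cube` for the authoritative description of accepted formats): a bag whose elements map ktk_cube dataset ids to pandas DataFrames. The dataset ids and columns below are assumptions.

import dask.bag as db
import pandas as pd

seed_df = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v1": [10, 11]})
enrich_df = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v2": [20, 21]})

# One bag element per group of data to write, keyed by ktk_cube dataset id.
data = db.from_sequence([{"seed": seed_df, "enrich": enrich_df}], npartitions=1)
# `data` would then be passed to build_cube_from_bag_internal together with a
# Cube specification and a store factory.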
Example No. 9
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".


    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes.
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written; must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only the given keys are updated/replaced. Deletion
        of metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional DataFrame to Parquet serializer.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that appends to the cube and returns the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=ktk_cube_dataset_ids)

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids,
            existing_datasets)
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
                _multiplex_prepare_data_for_ktk,
                cube=cube,
                existing_payload=existing_payload,
                partition_on=partition_on,
            ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
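A hedged sketch of what a remove_conditions value might look like, assuming kartothek's cube condition helper C; partitions matching the condition are removed (via the derived delete_scopes) before the new data is appended. The column name is illustrative.

# Assumption: kartothek.core.cube.conditions.C is the condition builder used for
# cube partition selection; "day" is a hypothetical partition column.
from kartothek.core.cube.conditions import C

remove_conditions = C("day") == "2020-01-01"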
Example No. 10
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that extends a cube by the data supplied from a dask bag.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see :func:`~kartothek.io.eager_cube.build_cube` for possible formats and types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written; must be specified in advance.
    metadata:
        Metadata for every dataset.
    overwrite:
        Whether existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional DataFrame to Parquet serializer.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that extends the cube and returns the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    check_datasets_preextend(ktk_cube_dataset_ids, cube)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in ktk_cube_dataset_ids
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=prep_partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
Example No. 11
def get_frequent_ngrams(analyzed_sentences: dbag.Bag,
                        max_ngram_length: int,
                        min_ngram_support: int,
                        min_ngram_support_per_partition: int,
                        ngram_sample_rate: float,
                        token_field: str = "tokens",
                        ngram_field: str = "ngrams") -> dbag.Bag:
    """
    Adds a new field containing a list of all mined n-grams. N-grams are tuples
    of strings such that at least one string is not a stopword. Strings are
    collected from the lemmas of sentences. To be counted, an n-gram must occur
    in at least `min_ngram_support` sentences overall and in at least
    `min_ngram_support_per_partition` sentences within a single partition.
    """
    def part_to_ngram_counts(
            records: Iterable[Record]) -> Iterable[Dict[Tuple[str], int]]:
        ngram2count = {}
        for rec in records:

            def interesting(idx):
                t = rec[token_field][idx]
                return not t["stop"] and t["pos"] in INTERESTING_POS_TAGS

            # beginning of ngram
            for start_tok_idx in range(len(rec[token_field])):
                # ngrams must begin with an interesting word
                if not interesting(start_tok_idx):
                    continue
                # for each potential n-gram size
                for ngram_len in range(2, max_ngram_length):
                    end_tok_idx = start_tok_idx + ngram_len
                    # ngrams cannot extend beyond the sentence
                    if end_tok_idx > len(rec[token_field]):
                        continue
                    # ngrams must end with an interesting word
                    if not interesting(end_tok_idx - 1):
                        continue
                    # the ngram is an ordered tuple of lemmas
                    ngram = tuple(
                        rec[token_field][tok_idx]["lemma"]
                        for tok_idx in range(start_tok_idx, end_tok_idx))
                    if ngram in ngram2count:
                        ngram2count[ngram] += 1
                    else:
                        ngram2count[ngram] = 1
        # filter out all low-occurrence ngrams in this partition
        return [{
            n: c
            for n, c in ngram2count.items()
            if c >= min_ngram_support_per_partition
        }]

    def valid_ngrams(ngram2count: Dict[str, int]) -> Set[Tuple[str]]:
        ngrams = {n for n, c in ngram2count.items() if c >= min_ngram_support}
        return ngrams

    def parse_ngrams(record: Record, ngram_model: Set[Tuple[str]]):
        record[ngram_field] = []
        start_tok_idx = 0
        while start_tok_idx < len(record[token_field]):
            incr = 1  # amount to move start_tok_idx
            # from max -> 2. Match longest
            for ngram_len in range(max_ngram_length, 1, -1):
                # get bounds of ngram and make sure its within sentence
                end_tok_idx = start_tok_idx + ngram_len
                if end_tok_idx > len(record[token_field]):
                    continue
                ngram = tuple(record[token_field][tok_idx]["lemma"]
                              for tok_idx in range(start_tok_idx, end_tok_idx))
                # if match
                if ngram in ngram_model:
                    record[ngram_field].append("_".join(ngram))
                    # skip over matched terms
                    incr = ngram_len
                    break
            start_tok_idx += incr
        return record

    # Begin the actual function
    if max_ngram_length < 1:
        # disable, record empty field for all ngrams
        def init_nothing(rec: Record) -> Record:
            rec[ngram_field] = []
            return rec

        return analyzed_sentences.map(init_nothing)
    else:
        ngram2count = (analyzed_sentences.random_sample(
            ngram_sample_rate).map_partitions(part_to_ngram_counts).fold(
                misc_util.merge_counts, initial={}))
        ngram_model = delayed(valid_ngrams)(ngram2count)
        return analyzed_sentences.map(parse_ngrams, ngram_model=ngram_model)
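A self-contained sketch of the overall pattern in the else branch: build a model with map_partitions() and fold(), wrap a post-processing step in dask.delayed, and feed the delayed model back into a map over the original bag. Word counting stands in for n-gram mining here; the threshold and data are made up.

import dask.bag as dbag
from dask import delayed

words = dbag.from_sequence(["a", "b", "a", "c", "a", "b"], npartitions=2)

def part_counts(ws):
    # one count dict per partition, mirroring part_to_ngram_counts above
    counts = {}
    for w in ws:
        counts[w] = counts.get(w, 0) + 1
    return [counts]

def merge_counts(x, y):
    merged = dict(x)
    for k, v in y.items():
        merged[k] = merged.get(k, 0) + v
    return merged

counts = words.map_partitions(part_counts).fold(merge_counts, initial={})
frequent = delayed(lambda c: {w for w, n in c.items() if n >= 2})(counts)
tagged = words.map(lambda w, model: (w, w in model), model=frequent)
print(tagged.compute())  # [('a', True), ('b', True), ('a', True), ('c', False), ...]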