def export_key_value_records(
    key_value_records: dbag.Bag,
    export_dir: Path,
) -> None:
    """Converts a Dask bag of dicts into a collection of json files.

    In order to create a lookup table, we must first export all data as json.
    This function maps each element of the input bag to a json encoded string
    and writes one file per partition to export_dir.

    WARNING: this function will delete any json files already present in
    export_dir.

    Args:
        key_value_records: A dask bag containing dicts.
        export_dir: The location to write json files. Any json files already
            present in this directory will be erased beforehand.
    """
    export_dir = Path(export_dir)
    # Clean up / set up the export dir
    export_dir.mkdir(parents=True, exist_ok=True)
    # Remove any previously constructed json files in there
    for json_file in export_dir.glob("*.json"):
        json_file.unlink()
    (
        key_value_records
        .map(_record_to_kv_json)
        .to_textfiles(f"{export_dir}/*.json")
    )
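
# Hypothetical usage sketch (not part of the original module): exports a tiny
# bag of key/value dicts as json files. The record contents and the target
# directory are illustrative only.
def _example_export_key_value_records() -> None:
    from pathlib import Path

    import dask.bag as dbag

    records = dbag.from_sequence(
        [{"key": "a", "value": 1}, {"key": "b", "value": 2}],
        npartitions=2,
    )
    # Writes one json file per partition into ./kv_export
    export_key_value_records(records, Path("./kv_export"))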
def fetch_page_ids(
    bucket_name: str = S3_CANONICAL_DATA_BUCKET,
    source: str = "issues",
    issue_bag: Optional[db.Bag] = None,
    n_partitions: int = 100,
) -> db.Bag:
    """Fetches canonical page IDs, either from the issue JSON (the "pp"
    field) or directly from the page files in the given S3 bucket."""
    valid_sources = ["issues", "pages"]
    assert source in valid_sources

    if source == "issues":
        print(f"Fetching page IDs from {source}")
        # reuse the provided issue bag so there is no need to recompute the issues
        if issue_bag is None:
            issue_bag = fetch_issues(bucket_name, compute=False).filter(
                lambda i: len(i) > 0
            )
        return issue_bag.map(lambda i: i["pp"]).flatten()
    else:
        page_files = list_pages(bucket_name)
        return (
            db.from_sequence(page_files, npartitions=n_partitions)
            .map(alternative_read_text, IMPRESSO_STORAGEOPT)
            .flatten()
            .map(json.loads)
            .filter(lambda i: len(i) > 0)
            .pluck("id")
        )
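
# Hypothetical usage sketch (not part of the original module): pulls a few
# page IDs from the default bucket. Assumes S3 credentials are configured for
# the impresso storage options used by this module.
def _example_fetch_page_ids() -> None:
    page_ids = fetch_page_ids(source="issues")
    # .take() triggers a small partial computation of the lazy bag
    print(page_ids.take(5))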
def create_lookup_table(
    record_bag: dbag.Bag,
    key_field: str,
    value_field: str,
    database_path: Path,
    intermediate_data_dir: Path,
    agatha_install_path: Path,
) -> None:
    """Creates an sqlite3 key-value lookup table from a bag of records.

    Exports `key_field` / `value_field` pairs from each record as
    intermediate json files, then builds the sqlite3 database from them.
    Does nothing if `database_path` already exists.
    """
    database_path = Path(database_path)
    intermediate_data_dir = Path(intermediate_data_dir)
    agatha_install_path = Path(agatha_install_path)
    if not database_path.is_file():
        if not intermediate_data_dir.exists():
            intermediate_data_dir.mkdir(parents=True, exist_ok=True)
        else:
            # Remove any previously constructed json files in there
            print("\t- Removing existing json files from", intermediate_data_dir)
            for json_file in intermediate_data_dir.glob("*.json"):
                json_file.unlink()
        print("\t- Writing intermediate json files")
        (
            # Save all keys and values as kv pair json files
            record_bag
            .map(
                _record_to_kv_json,
                key_field=key_field,
                value_field=value_field,
            )
            .to_textfiles(f"{intermediate_data_dir}/*.json")
        )
        print("\t- Writing", database_path)
        _make_sqlite3_database_from_json(
            intermediate_data_dir=intermediate_data_dir,
            database_path=database_path,
            agatha_install_path=agatha_install_path,
        )
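
# Hypothetical usage sketch (not part of the original module): builds an
# id -> text sqlite3 lookup from a bag of records. All paths are illustrative;
# agatha_install_path is assumed to point at a local agatha checkout.
def _example_create_lookup_table() -> None:
    from pathlib import Path

    import dask.bag as dbag

    records = dbag.from_sequence([
        {"id": "s:1", "text": "first sentence"},
        {"id": "s:2", "text": "second sentence"},
    ])
    create_lookup_table(
        record_bag=records,
        key_field="id",
        value_field="text",
        database_path=Path("./lookup.sqlite3"),
        intermediate_data_dir=Path("./lookup_json"),
        agatha_install_path=Path("./agatha"),  # assumed checkout location
    )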
def check_duplicated_content_item_IDs(issue_bag: bag.Bag) -> pd.DataFrame:
    """Finds content item IDs that occur more than once across all issues.

    .. note::
        This is a global check.

    :param bag.Bag issue_bag: Bag of issue JSON documents, each holding its
        content items under the "i" key.
    :return: One row per duplicated content item ID, with its frequency and
        the newspaper it belongs to.
    :rtype: pd.DataFrame
    """
    duplicates = (
        issue_bag.map(
            lambda issue_json: [ci["m"]["id"] for ci in issue_json["i"]]
        )
        .flatten()
        .frequencies()
        .filter(lambda i: i[1] > 1)
        .map(
            lambda i: {
                "ci_id": i[0],
                "freq": i[1],
                "newspaper_id": i[0].split("-")[0],
            }
        )
        .compute()
    )

    if duplicates:
        duplicates_df = pd.DataFrame(duplicates).set_index("ci_id")
    else:
        # there are no duplicates
        duplicates_df = pd.DataFrame(
            columns=["ci_id", "freq", "newspaper_id"]
        ).set_index("ci_id")

    print(
        f"Found {duplicates_df.shape[0]} duplicated "
        "content item IDs, belonging to "
        f"{duplicates_df.newspaper_id.unique().size} journals "
        f"({', '.join(list(duplicates_df.newspaper_id.unique()))})"
    )
    return duplicates_df
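
# Hypothetical usage sketch (not part of the original module): two issues
# sharing one content item ID should yield a single duplicate row.
def _example_check_duplicated_content_item_IDs() -> None:
    import dask.bag as bag

    issues = bag.from_sequence([
        {"i": [{"m": {"id": "GDL-1900-01-01-a-i0001"}}]},
        {"i": [{"m": {"id": "GDL-1900-01-01-a-i0001"}}]},
    ])
    dup_df = check_duplicated_content_item_IDs(issues)
    # The shared ID appears twice, so it lands in the result with freq == 2
    assert "GDL-1900-01-01-a-i0001" in dup_df.index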
def _run_apply(algorithms: Iterable[Algorithm], dataset: Bag) -> Tuple:
    """Applies every algorithm to every element of the dataset and folds the
    per-element results together, returning one aggregate per algorithm."""
    algorithms = list(algorithms)

    def fold(*args):
        # Element-wise combination of two tuples of per-algorithm results
        return tuple(left + right for left, right in zip(*args))

    reduced = (
        dataset
        .map(lambda data: tuple(alg.apply(data) for alg in algorithms))
        .fold(fold)
        .compute()
    )
    assert len(reduced) == len(algorithms)
    return tuple(reduced)
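
# Hypothetical usage sketch (not part of the original module): a stand-in
# Algorithm whose `apply` returns a one-element list, so that folding with `+`
# concatenates results across the bag.
def _example_run_apply() -> None:
    from dask.bag import from_sequence

    class _CollectSquares:  # duck-typed stand-in for a concrete Algorithm
        def apply(self, data):
            return [data * data]

    (squares,) = _run_apply(
        [_CollectSquares()],
        from_sequence([1, 2, 3], npartitions=2),
    )
    assert sorted(squares) == [1, 4, 9]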
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified, and modified version of kartothek's
    ``store_bag_as_dataset``, which cannot easily be used to store datasets in
    parallel (e.g. from a dict).

    ``delete_scopes`` is a dictionary mapping the kartothek dataset id to the
    ``delete_scope`` of the dataset (see ``update_dataset_from_partitions``
    for the definition of the single-dataset ``delete_scope``).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
                store=store,
            )

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and
    # `MetaPartition.build_indices`, so this is not required here anymore
    mps = mps.map(
        _multiplex_store, store=store, cube=cube, df_serializer=df_serializer
    )

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(
        perpartition=list,
        aggregate=aggregate,
        split_every=False,
        out_type=db.Bag,
    )
def record_to_bipartite_edges(
    records: dbag.Bag,
    get_neighbor_keys_fn: Callable[[Record], Iterable[str]],
    get_source_key_fn: Callable[[Record], str] = lambda x: x["id"],
    bidirectional: bool = True,
) -> dbag.Bag:
    """
    Extracts edges from records. For example, if you had a bag of records,
    each containing a set of terms, you might want to get the set of edges
    between records and terms.

    Args:
        records: The collection of records we wish to extract edges from.
        get_neighbor_keys_fn: Given a record, return a list of graph keys that
            are adjacent to the given record.
        get_source_key_fn: Given a record, return a graph key that uniquely
            identifies the record itself. By default we get the "id" field.
        bidirectional: If true, we write both record->neighbor and
            neighbor->record. If false, we only write record->neighbor.

    Returns:
        A bag containing serialized key-value pairs that can be used to create
        an Sqlite3LookupTable.
    """

    def _to_kv(recs: Iterable[Record]) -> List[str]:
        "Converts id/neighs records into key-value json strings."
        # Build an adjacency set, which removes duplicate edges
        graph = defaultdict(set)
        for r in recs:
            id_ = r["id"]
            for neigh in r["neighs"]:
                graph[id_].add(neigh)
                if bidirectional:
                    graph[neigh].add(id_)
        # Output edges
        res = []
        for source, targets in graph.items():
            for target in targets:
                res.append(json.dumps(dict(key=source, value=target)))
        return res

    return (
        records
        .map(lambda r: {
            "id": get_source_key_fn(r),
            "neighs": get_neighbor_keys_fn(r),
        })
        .map_partitions(_to_kv)
    )
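
# Hypothetical usage sketch (not part of the original module): records with a
# "terms" field become record<->term edges. The field names are illustrative.
def _example_record_to_bipartite_edges() -> None:
    import dask.bag as dbag

    records = dbag.from_sequence([
        {"id": "doc:1", "terms": ["t:cat", "t:dog"]},
        {"id": "doc:2", "terms": ["t:cat"]},
    ])
    edges = record_to_bipartite_edges(
        records,
        get_neighbor_keys_fn=lambda r: r["terms"],
    )
    # Each element is a json string such as {"key": "doc:1", "value": "t:cat"}
    print(edges.take(3))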
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask
    bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. If left
        unprovided, it is assumed that only the seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube returning the
        dict of dataset metadata objects. The bag has a single partition with a single
        element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids is None:
        ktk_cube_dataset_ids = [cube.seed_dataset]
    else:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=set(),
            partition_on=prep_partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
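
# Hypothetical usage sketch (not part of the original module): builds a tiny
# cube through kartothek's public wrapper around this internal function.
# Assumes the kartothek/storefact imports below are available; the cube layout
# and the "hmemory://" in-memory store URL are illustrative.
def _example_build_cube_from_bag() -> None:
    from functools import partial

    import dask.bag as db
    import pandas as pd
    import storefact
    from kartothek.core.cube.cube import Cube
    from kartothek.io.dask.bag_cube import build_cube_from_bag

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="example_cube",
    )
    # A store *factory*: a callable returning the store, as required above
    store = partial(storefact.get_store_from_url, "hmemory://")
    seed = db.from_sequence(
        [pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10, 11]})],
        npartitions=1,
    )
    # Only the seed dataset is written when ktk_cube_dataset_ids is omitted
    build_cube_from_bag(data=seed, cube=cube, store=store).compute()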
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a
        physical partition are updated, the old data is treated as "removed".

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are
        updated/replaced. Deletion of metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube returning
        the dict of dataset metadata objects. The bag has a single partition with a
        single element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # existing_payload is set to empty because we're not checking against any
    # existing payload. ktk will account for the compat check within 1 dataset
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids, existing_datasets
        )
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(_fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
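
# Hypothetical usage sketch (not part of the original module): appends rows
# through the public wrapper, replacing the physical partitions matched by a
# condition. Assumes a cube/store built as in the previous example, and that
# the installed kartothek version exposes remove_conditions on the wrapper.
def _example_append_to_cube_from_bag(cube, store) -> None:
    import dask.bag as db
    import pandas as pd
    from kartothek.core.cube.conditions import C
    from kartothek.io.dask.bag_cube import append_to_cube_from_bag

    new_seed = db.from_sequence(
        [pd.DataFrame({"x": [2, 3], "p": [0, 0], "v": [12, 13]})],
        npartitions=1,
    )
    append_to_cube_from_bag(
        data=new_seed,
        cube=cube,
        store=store,
        ktk_cube_dataset_ids=["seed"],
        remove_conditions=(C("p") == 0),  # physical partitions to replace
    ).compute()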
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that extends a cube by the data supplied from a dask
    bag.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see :func:`~kartothek.io.eager_cube.build_cube` for
        possible format and types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to extend a cube returning the
        dict of dataset metadata objects. The bag has a single partition with a single
        element.
    """
    check_store_factory(store)
    check_datasets_preextend(ktk_cube_dataset_ids, cube)

    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in ktk_cube_dataset_ids
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=prep_partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
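
# Hypothetical usage sketch (not part of the original module): extends an
# existing cube with a non-seed enrichment dataset via the public wrapper.
# Assumes a cube/store built as in the build example; "enrich" and the column
# "w" are illustrative names.
def _example_extend_cube_from_bag(cube, store) -> None:
    import dask.bag as db
    import pandas as pd
    from kartothek.io.dask.bag_cube import extend_cube_from_bag

    # A dict payload maps the target dataset id to its dataframe
    enrich = db.from_sequence(
        [{"enrich": pd.DataFrame({"x": [0, 1], "p": [0, 0], "w": [0.1, 0.2]})}],
        npartitions=1,
    )
    extend_cube_from_bag(
        data=enrich,
        cube=cube,
        store=store,
        ktk_cube_dataset_ids=["enrich"],
    ).compute()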
def get_frequent_ngrams(
    analyzed_sentences: dbag.Bag,
    max_ngram_length: int,
    min_ngram_support: int,
    min_ngram_support_per_partition: int,
    ngram_sample_rate: float,
    token_field: str = "tokens",
    ngram_field: str = "ngrams",
) -> dbag.Bag:
    """
    Adds a new field containing a list of all mined n-grams. N-grams are
    tuples of strings such that at least one string is not a stopword. Strings
    are collected from the lemmas of sentences. To be counted, an ngram must
    occur in at least `min_ngram_support` sentences.
    """

    def part_to_ngram_counts(
        records: Iterable[Record],
    ) -> Iterable[Dict[Tuple[str, ...], int]]:
        ngram2count = {}
        for rec in records:

            def interesting(idx):
                t = rec[token_field][idx]
                return not t["stop"] and t["pos"] in INTERESTING_POS_TAGS

            # beginning of ngram
            for start_tok_idx in range(len(rec[token_field])):
                # ngrams must begin with an interesting word
                if not interesting(start_tok_idx):
                    continue
                # for each potential n-gram size (the +1 makes
                # max_ngram_length inclusive, matching parse_ngrams below)
                for ngram_len in range(2, max_ngram_length + 1):
                    end_tok_idx = start_tok_idx + ngram_len
                    # ngrams cannot extend beyond the sentence
                    if end_tok_idx > len(rec[token_field]):
                        continue
                    # ngrams must end with an interesting word
                    if not interesting(end_tok_idx - 1):
                        continue
                    # the ngram is an ordered tuple of lemmas
                    ngram = tuple(
                        rec[token_field][tok_idx]["lemma"]
                        for tok_idx in range(start_tok_idx, end_tok_idx)
                    )
                    if ngram in ngram2count:
                        ngram2count[ngram] += 1
                    else:
                        ngram2count[ngram] = 1
        # filter out all low-occurrence ngrams in this partition
        return [{
            n: c
            for n, c in ngram2count.items()
            if c >= min_ngram_support_per_partition
        }]

    def valid_ngrams(
        ngram2count: Dict[Tuple[str, ...], int],
    ) -> Set[Tuple[str, ...]]:
        return {n for n, c in ngram2count.items() if c >= min_ngram_support}

    def parse_ngrams(record: Record, ngram_model: Set[Tuple[str, ...]]):
        record[ngram_field] = []
        start_tok_idx = 0
        while start_tok_idx < len(record[token_field]):
            incr = 1  # amount to move start_tok_idx
            # from max -> 2. Match longest
            for ngram_len in range(max_ngram_length, 1, -1):
                # get bounds of ngram and make sure it's within the sentence
                end_tok_idx = start_tok_idx + ngram_len
                if end_tok_idx > len(record[token_field]):
                    continue
                ngram = tuple(
                    record[token_field][tok_idx]["lemma"]
                    for tok_idx in range(start_tok_idx, end_tok_idx)
                )
                # if match
                if ngram in ngram_model:
                    record[ngram_field].append("_".join(ngram))
                    # skip over matched terms
                    incr = ngram_len
                    break
            start_tok_idx += incr
        return record

    # Begin the actual function
    if max_ngram_length < 1:
        # disable, record empty field for all ngrams
        def init_nothing(rec: Record) -> Record:
            rec[ngram_field] = []
            return rec

        return analyzed_sentences.map(init_nothing)
    else:
        ngram2count = (
            analyzed_sentences
            .random_sample(ngram_sample_rate)
            .map_partitions(part_to_ngram_counts)
            .fold(misc_util.merge_counts, initial={})
        )
        ngram_model = delayed(valid_ngrams)(ngram2count)
        return analyzed_sentences.map(parse_ngrams, ngram_model=ngram_model)
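
# Hypothetical usage sketch (not part of the original module): a minimal
# analyzed-sentence record. Token dicts need "lemma", "stop", and "pos"
# fields; the pos value below is assumed to be in INTERESTING_POS_TAGS.
def _example_get_frequent_ngrams() -> None:
    import dask.bag as dbag

    def tok(lemma, pos="NOUN", stop=False):
        return {"lemma": lemma, "pos": pos, "stop": stop}

    sentences = dbag.from_sequence(
        [{"tokens": [tok("amyloid"), tok("beta")]}] * 5,
        npartitions=1,
    )
    result = get_frequent_ngrams(
        sentences,
        max_ngram_length=2,
        min_ngram_support=3,
        min_ngram_support_per_partition=1,
        ngram_sample_rate=1.0,
    ).compute()
    # Every sentence should now contain the mined bigram "amyloid_beta"
    print(result[0]["ngrams"])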