Example #1
def create_lookup_table(
    record_bag: dbag.Bag,
    key_field: str,
    value_field: str,
    database_path: Path,
    intermediate_data_dir: Path,
    agatha_install_path: Path,
) -> None:
    database_path = Path(database_path)
    intermediate_data_dir = Path(intermediate_data_dir)
    agatha_install_path = Path(agatha_install_path)
    if not database_path.is_file():
        if not intermediate_data_dir.exists():
            intermediate_data_dir.mkdir(parents=True, exist_ok=True)
        else:
            # Remove any previously constructed json files in there
            print("\t- Removing existing json files from",
                  intermediate_data_dir)
            for json_file in intermediate_data_dir.glob("*.json"):
                json_file.unlink()

        print("\t- Writing intermediate json files")
        (  # Save all keys and values as kv pair json files
            record_bag.map(_record_to_kv_json,
                           key_field=key_field,
                           value_field=value_field).to_textfiles(
                               f"{intermediate_data_dir}/*.json"))
        print("\t- Writing", database_path)
        _make_sqlite3_database_from_json(
            intermediate_data_dir=intermediate_data_dir,
            database_path=database_path,
            agatha_install_path=agatha_install_path)
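A minimal call sketch for the function above. The record fields, paths, and the agatha install location are placeholders; `_record_to_kv_json` and `_make_sqlite3_database_from_json` are assumed to come from the same module.

import dask.bag as dbag
from pathlib import Path

# Hypothetical records; "word" and "definition" are placeholder field names.
records = dbag.from_sequence(
    [{"word": "cat", "definition": "a small felid"},
     {"word": "dog", "definition": "a domesticated canid"}],
    npartitions=2,
)
create_lookup_table(
    record_bag=records,
    key_field="word",
    value_field="definition",
    database_path=Path("lookup.sqlite3"),
    intermediate_data_dir=Path("tmp_kv_json"),
    agatha_install_path=Path("/path/to/agatha"),  # placeholder install location
)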
Example #2
def fetch_page_ids(
    bucket_name: str = S3_CANONICAL_DATA_BUCKET,
    source: str = "issues",
    issue_bag: db.Bag = None,
    n_partitions: int = 100,
) -> db.Bag:

    valid_sources = ["issues", "pages"]
    assert source in valid_sources

    if issue_bag is None:
        issue_bag = fetch_issues(bucket_name,
                                 compute=False).filter(lambda i: len(i) > 0)

    if source == "issues":
        print(f"Fetching page IDs from {source}")
        # no need to recompute the issues
        if issue_bag:
            pass
        else:
            issue_bag = fetch_issues(compute=False)
        return issue_bag.map(lambda i: i["pp"]).flatten()
    else:
        page_files = list_pages(bucket_name)
        return (db.from_sequence(page_files, npartitions=n_partitions).map(
            alternative_read_text, IMPRESSO_STORAGEOPT).flatten().map(
                json.loads).filter(lambda i: len(i) > 0).pluck("id"))
Example #3
def check_duplicated_content_item_IDs(issue_bag: bag.Bag) -> pd.DataFrame:
    """Short summary.

    ..note::
        This is a global check.

    :param bag.Bag issue_bag: Description of parameter `issue_bag`.
    :return: Description of returned object.
    :rtype: pd.DataFrame

    """
    duplicates = (issue_bag.map(
        lambda issue_json: [ci["m"]["id"] for ci in issue_json["i"]]).flatten(
        ).frequencies().filter(lambda i: i[1] > 1).map(
            lambda i: {
                "ci_id": i[0],
                "freq": i[1],
                "newspaper_id": i[0].split("-")[0]
            }).compute())

    if duplicates:
        duplicates_df = pd.DataFrame(duplicates).set_index("ci_id")
    else:
        # there are no duplicates
        duplicates_df = pd.DataFrame(
            columns=["ci_id", "freq", "newspaper_id"]).set_index("ci_id")

    print((f"Found {duplicates_df.shape[0]} duplicated "
           "content item IDs, belonging to "
           f"{duplicates_df.newspaper_id.unique().size} journals"
           f"({', '.join(list(duplicates_df.newspaper_id.unique()))})"))
    return duplicates_df
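A small usage sketch with toy issues that mimic the JSON shape implied by the code above (`issue["i"]` holds content items whose IDs sit under `["m"]["id"]`):

import dask.bag as bag

issues = bag.from_sequence([
    {"i": [{"m": {"id": "GDL-1900-01-01-a-i0001"}},
           {"m": {"id": "GDL-1900-01-01-a-i0001"}}]},  # duplicated on purpose
    {"i": [{"m": {"id": "JDG-1900-01-01-a-i0002"}}]},
])
duplicates_df = check_duplicated_content_item_IDs(issues)
# -> one row indexed by "GDL-1900-01-01-a-i0001" with freq 2 and newspaper_id "GDL"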
Example #4
def export_key_value_records(
    key_value_records: dbag.Bag,
    export_dir: Path,
) -> None:
    """Converts a Dask bag of Dicts into a collection of json files.

  In order to create a lookup table, we must first export all data as json.
  This function maps each element of the input bag to a json encoded string and
  writes one file per partition to the export_dir. WARNING: this function will
  delete any json files already present in export_dir.

  Args:
    key_value_records: A dask bag containing dicts.
    export_dir: The location to write json files. Will erase any if present
      beforehand.

  """
    export_dir = Path(export_dir)
    # Clean up / setup export dir
    export_dir.mkdir(parents=True, exist_ok=True)
    # Remove any previously constructed json files in there
    for json_file in export_dir.glob("*.json"):
        json_file.unlink()
    (key_value_records.map(_record_to_kv_json).to_textfiles(
        f"{export_dir}/*.json"))
Example #5
def put_bag(
    bag: dbag.Bag,
    collection: str,
    indexed_field_name: Optional[str] = None,
    index_type: MONGO_INDEX = pymongo.HASHED,
) -> dbag.Bag:
    """
  Writes all the records to collection. Sets index if specified. Returns a bag
  simply containing the number of written records, intended for use in the
  checkpointing system.
  """
    def put_part_wrapper(*args, **kwargs):
        put(*args, **kwargs)
        return [True]

    if indexed_field_name is not None:
        print(
            f"\t- Setting index: {collection}.{indexed_field_name}:{index_type}"
        )
        set_index(collection=collection,
                  field_name=indexed_field_name,
                  index_type=index_type)
    return dbag.from_delayed([
        dask.delayed(put_part_wrapper)(
            records=part,
            collection=collection,
        ) for part in bag.to_delayed()
    ])
Example #6
def _multiplex_store(
    data: Dict[str, MetaPartition],
    cube: Cube,
    store: StoreFactory,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Dict[str, MetaPartition]:
    result = {}
    for k in sorted(data.keys()):
        v = data.pop(k)
        result[k] = MetaPartition.store_dataframes(
            v,
            dataset_uuid=cube.ktk_dataset_uuid(k),
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del v
    return result
Example #7
def remove_deleted_tweets(data: db.Bag) -> db.Bag:
    """
    Function to remove unneeded tweets.
    Deleted tweets don't include various parameters, including the `lang` parameter.
    :param data: dask bag that contains the tweets
    :return: the items that haven't been deleted
    """
    return data.filter(lambda x: 'lang' in x)
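A quick sketch of the filter in action; deleted-tweet payloads (which lack `lang`) are dropped:

import dask.bag as db

tweets = db.from_sequence([
    {"id": 1, "lang": "en", "text": "hello"},
    {"delete": {"status": {"id": 2}}},  # deletion notice, no `lang` key
])
remove_deleted_tweets(tweets).compute()
# -> [{'id': 1, 'lang': 'en', 'text': 'hello'}]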
Example #8
def nearest_neighbors_network_from_index(
    hash_and_embedding:dbag.Bag,
    inverted_index_collection:str,
    batch_size:int,
    num_neighbors:int,
    faiss_index_name="final",
    weight:float=1.0,
)->Iterable[nx.Graph]:
  """
  Applies faiss and runs results through inverted index. Requires
  knn_util:faiss_index and knn_util:inverted_index to be initialized.
  """
  def apply_faiss_to_edges(
      hash_and_embedding:Iterable[Record],
  )->Iterable[nx.Graph]:

    # The only reason we need parts_written_to_db is to make sure that the
    # writing happens before this point
    index = dpg.get(f"knn_util:faiss_{faiss_index_name}")
    inverted_index = {}

    graph = nx.Graph()
    for batch in iter_to_batches(hash_and_embedding, batch_size):
      hashes, embeddings = records_to_ids_and_embeddings(
          records=batch,
      )
      _, neighs_per_root = index.search(embeddings, num_neighbors)

      # Collect roots and neighbors, then resolve any hashes we haven't seen yet
      root_hashes = hashes.tolist()
      new_hashes = list(
          set(root_hashes + flatten_list(neighs_per_root.tolist()))
          - set(inverted_index.keys()))

      graph_keys = database_util.get(
          values=new_hashes,
          collection=inverted_index_collection,
          field_name="hash",
          desired_fields=["strid"]
      )
      for k, v in zip(new_hashes, graph_keys):
        inverted_index[k] = v["strid"]

      # Create records
      for root_idx, neigh_indices in zip(root_hashes, neighs_per_root):
        root = inverted_index[root_idx]
        if root is None:
          continue
        for neigh_idx in neigh_indices:
          if neigh_idx == root_idx:
            continue
          neigh = inverted_index[neigh_idx]
          if neigh is None:
            continue
          graph.add_edge(root, neigh, weight=weight)
          graph.add_edge(neigh, root, weight=weight)
    return [graph]

  return hash_and_embedding.map_partitions(apply_faiss_to_edges)
Example #9
def _run_apply(algorithms: Iterable[Algorithm], dataset: Bag) -> Tuple:
    algorithms = list(algorithms)

    def fold(*args):
        return tuple(left + right for left, right in zip(*args))

    reduced = (dataset.map(lambda data: tuple(
        alg.apply(data) for alg in algorithms)).fold(fold).compute())
    assert len(reduced) == len(algorithms)
    return tuple(reduced)
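A self-contained sketch with two stand-in algorithm objects; the only assumption `_run_apply` makes is that each algorithm's `apply` returns a tuple that can be concatenated across elements:

import dask.bag as db

class CollectValues:
    """Stand-in algorithm: wraps each element in a 1-tuple."""
    def apply(self, data):
        return (data,)

class CountElements:
    """Stand-in algorithm: contributes a 1 per element."""
    def apply(self, data):
        return (1,)

numbers = db.from_sequence([1, 2, 3, 4], npartitions=2)
values, counts = _run_apply([CollectValues(), CountElements()], numbers)
# values gathers every element (order may vary); counts == (1, 1, 1, 1)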
Example #10
def _(data: Bag, fr: float, to: float, bins: int) -> Tuple[ndarray, ndarray]:
    # @jit(nopython=True, nogil=True)  # todo: jit this function
    def inc(values: Iterable[float]) -> ndarray:
        binned = digitize(values, linspace(fr, to, bins + 1))
        init = zeros(bins + 2, dtype=uint64)
        for i in binned:
            init[i] += 1
        return init

    hist = data.reduction(inc, sum)
    return hist, linspace(fr, to, bins + 1)
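Calling the overload directly (the `_` name suggests it is registered with a dispatcher) on a toy bag of floats:

import dask.bag as db

samples = db.from_sequence([0.05, 0.40, 0.35, 0.80], npartitions=2)
hist, edges = _(samples, fr=0.0, to=1.0, bins=4)
hist.compute()
# -> array of length bins + 2; the first and last slots catch under/overflow,
#    here [0, 1, 2, 0, 1, 0]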
Example #11
def save(bag:dbag.Bag, path:Path, keep_partial_result:bool=False)->dask.delayed:
  path.mkdir(parents=True, exist_ok=True)
  save_tasks = []
  for part_idx, part in enumerate(bag.to_delayed()):
    part_path = path.joinpath(f"part-{part_idx}{EXT}")
    # if the partial result is not present, or we're not keeping partials
    if not part_path.is_file() or not keep_partial_result:
      save_tasks.append(dask.delayed(save_part)(part, part_path))
    else:
      # introduces a no-op that keeps __done__ file correct
      save_tasks.append(dask.delayed(part_path))
  return dask.delayed(write_done_file)(save_tasks, path)
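A checkpoint-style usage sketch; `save_part`, `write_done_file`, and `EXT` are assumed to live in the same module as `save`:

import dask.bag as dbag
from pathlib import Path

records = dbag.from_sequence([{"x": 1}, {"x": 2}], npartitions=2)
done = save(records, Path("checkpoint_dir"))
done.compute()  # writes one part-*.EXT file per partition, then the __done__ marker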
Example #12
def check_duplicated_issues_IDs(issue_bag: bag.Bag) -> pd.DataFrame:
    """Check that newspaper issue IDs are unique within the corpus."""

    duplicate_issue_ids = (
        issue_bag.pluck("id").frequencies().filter(lambda i: i[1] > 1).map(
            lambda i: {
                "issue_id": i[0],
                "freq": i[1],
                "newspaper_id": i[0].split("-")[0],
            }).compute())
    print(f"{len(duplicate_issue_ids)} duplicated IDs were found")
    return pd.DataFrame(duplicate_issue_ids).set_index("issue_id")
Example #13
def _(data: Bag, xfr: float, xto: float, xbins: int, yfr: float, yto: float,
      ybins: int) -> Tuple[ndarray, ndarray, ndarray]:
    # @jit(nopython=True, nogil=True)  # todo: jit this function
    def inc(values: Iterable[Tuple[float, float]]) -> ndarray:
        xvalues, yvalues = column_stack(values)
        xbinned = digitize(xvalues, linspace(xfr, xto, xbins + 1))
        ybinned = digitize(yvalues, linspace(yfr, yto, ybins + 1))
        init = zeros((xbins + 2, ybins + 2), dtype=uint64)
        for x, y in zip(xbinned, ybinned):
            init[x, y] += 1
        return init

    hist = data.reduction(inc, sum)
    return hist, linspace(xfr, xto, xbins + 1), linspace(yfr, yto, ybins + 1)
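The 2-D counterpart can be exercised the same way, feeding a bag of (x, y) pairs:

import dask.bag as db

points = db.from_sequence([(0.1, 0.2), (0.6, 0.9), (0.4, 0.5)], npartitions=1)
hist2d, xedges, yedges = _(points, xfr=0.0, xto=1.0, xbins=4,
                           yfr=0.0, yto=1.0, ybins=4)
hist2d.compute()  # (xbins + 2) x (ybins + 2) matrix of counts, overflow on the borders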
Example #14
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``, which cannot easily be used to
    store datasets in parallel (e.g. from a dict).

    ``delete_scopes`` is a dictionary mapping each kartothek dataset id to the `delete_scope` of that dataset
    (see `update_dataset_from_partitions` for the definition of the single dataset `delete_scope`).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
                store=store)

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and `MetaPartition.build_indices`, so this is not
    # required here anymore

    mps = mps.map(_multiplex_store,
                  store=store,
                  cube=cube,
                  df_serializer=df_serializer)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False,
                         out_type=db.Bag)
Example #15
def to_training_database(bag:dbag.Bag, database_dir:Path):
  assert database_dir.is_dir()
  done_file = database_dir.joinpath("__done__")
  if not done_file.is_file():
    def part_to_db(records):
      r_name = "".join([random.choice(string.ascii_letters) for _ in range(10)])
      db_path = database_dir.joinpath(r_name + ".sqlite")
      with SqliteDict(db_path, journal_mode="OFF", flag="n") as db:
        for idx, rec in enumerate(records):
          db[str(idx)] = rec
        db.commit()
      return db_path
    db_paths = bag.map_partitions(part_to_db).compute()
    with open(done_file, 'w') as f:
      for p in db_paths:
        f.write(f"{p}\n")
Example #16
def extract_entities_and_predicates_from_sentences(
    sentence_records: dbag.Bag,
    semrep_install_dir: Path,
    unicode_to_ascii_jar_path: Path,
    work_dir: Path,
    lexicon_year: int,
    mm_data_year: str,
    mm_data_version: str,
) -> dbag.Bag:
  """Runs each sentence through SemRep. Identifies Predicates and Entities

  Requires get_metamap_server_initializer added to dask_process_global.

  Args:
    sentence_records: Each record needs `id` and `sent_text`.
    work_dir: A directory visible to all workers where SemRep intermediate files
      will be stored.
    semrep_install_dir: The path where semrep was installed.

  Returns:
    One record per input sentence, where `id` of the new record matches the
    input. However, returned records will only have `entities` and `predicates`

  """

  work_dir = Path(work_dir)
  assert work_dir.is_dir(), f"Failed to find shared work_dir: {work_dir}"
  semrep_input_dir = work_dir.joinpath("input_files")
  semrep_output_dir = work_dir.joinpath("output_files")
  semrep_input_dir.mkdir(exist_ok=True, parents=True)
  semrep_output_dir.mkdir(exist_ok=True, parents=True)

  semrep_tasks = []
  for part_idx, partition in enumerate(sentence_records.to_delayed()):
    semrep_input_path = semrep_input_dir.joinpath(f"input_{part_idx}.txt")
    # semrep_output_path = semrep_output_dir.joinpath(f"ouput_{part_idx}.xml")
    semrep_tasks.append(dask.delayed(_sentence_partition_to_records)(
        records=partition,
        unicode_to_ascii_jar_path=unicode_to_ascii_jar_path,
        input_path=semrep_input_path,
        semrep_install_dir=semrep_install_dir,
        lexicon_year=lexicon_year,
        mm_data_year=mm_data_year,
        mm_data_version=mm_data_version,
    ))
  return dbag.from_delayed(semrep_tasks)
Example #17
def record_to_bipartite_edges(
    records: dbag.Bag,
    get_neighbor_keys_fn: Callable[[Record], Iterable[str]],
    get_source_key_fn: Callable[[Record], str] = lambda x: x["id"],
    bidirectional: bool = True,
) -> dbag.Bag:
    """
  This function is responsible for extracting edges from records. For example,
  if you had a bag of records, each containing a set of terms, you might want
  to get the set of edges between records and terms.

  Args:
    records: The collection of records we wish to extract edges from.
    get_neighbor_keys_fn: Given a record, return a list of graph keys that
      are adjacent to the given record
    get_source_key_fn: Given a record, return a graph key that uniquely
      identifies the root. By default we get the "id" field
    bidirectional: If true, we write record->neighbor and neighbor->record.
      If false, we only write record->neighbor.

  Returns:
    A bag containing serialized key-value pairs that can be used to create a
    Sqlite3LookupTable

  """
    def _to_kv(recs: Iterable[Record]) -> List[str]:
        "id, neighs to key_value strings"
        # Create graph, remove duplicate edges
        graph = defaultdict(set)
        for r in recs:
            id_ = r["id"]
            for neigh in r["neighs"]:
                graph[id_].add(neigh)
                if bidirectional:
                    graph[neigh].add(id_)
        # Output edges
        res = []
        for source, targets in graph.items():
            for target in targets:
                res.append(json.dumps(dict(key=source, value=target)))
        return res

    return (records.map(lambda r: {
        "id": get_source_key_fn(r),
        "neighs": get_neighbor_keys_fn(r)
    }).map_partitions(_to_kv))
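A short sketch: given records with an `id` and some neighbor list (here a hypothetical `terms` field), the function emits JSON key/value strings in both directions:

import dask.bag as dbag

docs = dbag.from_sequence([
    {"id": "s:doc1", "terms": ["l:cat", "l:dog"]},
    {"id": "s:doc2", "terms": ["l:dog"]},
])
edges = record_to_bipartite_edges(
    records=docs,
    get_neighbor_keys_fn=lambda r: r["terms"],
)
edges.compute()
# -> strings such as '{"key": "s:doc1", "value": "l:cat"}' plus the reverse edges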
Example #18
def nearest_neighbors_network_from_index(
    hash_and_embedding: dbag.Bag,
    hash2name_db: Path,
    batch_size: int,
    num_neighbors: int,
    faiss_index_name="final",
    weight: float = 1.0,
) -> Iterable[nx.Graph]:
    """
  Applies faiss and runs results through inverted index.
  """
    assert hash2name_db.is_file(), "Missing hash2names sqlite3 db."

    def apply_faiss_to_edges(
        hash_and_embedding: Iterable[Record], ) -> Iterable[nx.Graph]:

        # The only reason we need parts_written_to_db is to make sure that the
        # writing happens before this point
        index = dpg.get(f"knn_util:faiss_{faiss_index_name}")

        graph = nx.Graph()
        with sqlite3_lookup.Sqlite3LookupTable(hash2name_db) as hash2names:
            for batch in iter_to_batches(hash_and_embedding, batch_size):
                hashes, embeddings = to_hash_and_embedding(records=batch)
                _, neighs_per_root = index.search(embeddings, num_neighbors)
                hashes = hashes.tolist() + flatten_list(
                    neighs_per_root.tolist())
                # Create records
                for root_hash, neigh_indices in zip(hashes, neighs_per_root):
                    if root_hash in hash2names:
                        for root_name in hash2names[root_hash]:
                            for neigh_hash in neigh_indices:
                                if neigh_hash != root_hash and neigh_hash in hash2names:
                                    for neigh_name in hash2names[neigh_hash]:
                                        graph.add_edge(root_name,
                                                       neigh_name,
                                                       weight=weight)
                                        graph.add_edge(neigh_name,
                                                       root_name,
                                                       weight=weight)
        return [graph]

    return hash_and_embedding.map_partitions(apply_faiss_to_edges)
Example #19
def nearest_neighbors_network_from_index(
    hash_and_embedding: dbag.Bag,
    hash2name_db: Path,
    batch_size: int,
    num_neighbors: int,
    faiss_index_name="final",
    weight: float = 1.0,
) -> Iterable[str]:
    """
  Applies faiss and runs results through inverted index.
  """
    assert hash2name_db.is_file(), "Missing hash2names sqlite3 db."

    def _apply_faiss(hash_and_embedding: Iterable[Record], ) -> List[Record]:

        # The only reason we need parts_written_to_db is to make sure that the
        # writing happens before this point
        index = dpg.get(f"knn_util:faiss_{faiss_index_name}")
        hash2names = sqlite3_lookup.Sqlite3LookupTable(hash2name_db)

        # "id", "neighs"
        res = []
        for batch in iter_to_batches(hash_and_embedding, batch_size):
            hashes, embeddings = to_hash_and_embedding(records=batch)
            _, neighs_per_root = index.search(embeddings, num_neighbors)
            hashes = hashes.tolist() + flatten_list(neighs_per_root.tolist())
            # Create records
            for root_hash, neigh_indices in zip(hashes, neighs_per_root):
                if root_hash in hash2names:
                    for root_name in hash2names[root_hash]:
                        val = {"id": root_name, "neighs": set()}
                        for neigh_hash in neigh_indices:
                            if neigh_hash != root_hash and neigh_hash in hash2names:
                                for neigh_name in hash2names[neigh_hash]:
                                    val["neighs"].add(neigh_name)
                        res.append(val)
        return res

    return graph_util.record_to_bipartite_edges(
        hash_and_embedding.map_partitions(_apply_faiss),
        bidirectional=True,
        get_neighbor_keys_fn=lambda r: r["neighs"])
Example #20
def main():
    args = parse_args()

    if args.dir and (args.fwd or args.rev):
        sys.exit("Can define both dir and file options.  Please choose one")

    elif args.dir:
        files = glob(args.dir + "/*R1*.gz")
        files.sort()
        files = [(f, f.replace("_R1", "_R2")) for f in files]
    else:
        fwd_file = args.fwd
        rev_file = args.rev
        files = [(fwd_file, rev_file)]

    if args.parallel:
        import dask.bag as db
        b = db.from_sequence(files)
        # Each element is a (fwd, rev) tuple; compute() triggers the parallel run
        b.map(lambda pair: process_gRNA(pair[0], pair[1], args.wtlib)).compute()
    else:
        print(files)
        for f, r in files:
            process_gRNA(f, r, args.wtlib)
Example #21
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".


    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=ktk_cube_dataset_ids)

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids,
            existing_datasets)
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
                _multiplex_prepare_data_for_ktk,
                cube=cube,
                existing_payload=existing_payload,
                partition_on=partition_on,
            ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
Example #22
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that extends a cube by the data supplied from a dask bag.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see :func:`~kartothek.io.eager_cube.build_cube` for possible format and types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to extend a cube returning the dict of dataset metadata objects.
        The bag has a single partition with a single element.
    """
    check_store_factory(store)
    check_datasets_preextend(ktk_cube_dataset_ids, cube)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in ktk_cube_dataset_ids
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=prep_partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
Example #23
from operator import methodcaller

from odo import chunks, TextFile, odo
from dask.bag import Bag
from odo.utils import filetexts


def inc(x):
    return x + 1


dsk = {('x', 0): (range, 5), ('x', 1): (range, 5), ('x', 2): (range, 5)}

L = list(range(5)) * 3

b = Bag(dsk, 'x', 3)


def test_convert_bag_to_list():
    assert odo(b, list) == L


def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert (list(map(methodcaller('strip'), odo(b, list))) == list(
            map(methodcaller('strip'), odo(logs, list))))
Example #24
def get_frequent_ngrams(analyzed_sentences: dbag.Bag,
                        max_ngram_length: int,
                        min_ngram_support: int,
                        min_ngram_support_per_partition: int,
                        ngram_sample_rate: float,
                        token_field: str = "tokens",
                        ngram_field: str = "ngrams") -> dbag.Bag:
    """
  Adds a new field containing a list of all mined n-grams.  N-grams are tuples
  of strings such that at least one string is not a stopword.  Strings are
  collected from the lemmas of sentences.  To be counted, an ngram must occur
  in at least `min_ngram_support` sentences.
  """
    def part_to_ngram_counts(
            records: Iterable[Record]) -> Iterable[Dict[Tuple[str], int]]:
        ngram2count = {}
        for rec in records:

            def interesting(idx):
                t = rec[token_field][idx]
                return not t["stop"] and t["pos"] in INTERESTING_POS_TAGS

            # beginning of ngram
            for start_tok_idx in range(len(rec[token_field])):
                # ngrams must begin with an interesting word
                if not interesting(start_tok_idx):
                    continue
                # for each potential n-gram size
                for ngram_len in range(2, max_ngram_length):
                    end_tok_idx = start_tok_idx + ngram_len
                    # ngrams cannot extend beyond the sentence
                    if end_tok_idx > len(rec[token_field]):
                        continue
                    # ngrams must end with an interesting word
                    if not interesting(end_tok_idx - 1):
                        continue
                    # the ngram is an ordered tuple of lemmas
                    ngram = tuple(
                        rec[token_field][tok_idx]["lemma"]
                        for tok_idx in range(start_tok_idx, end_tok_idx))
                    if ngram in ngram2count:
                        ngram2count[ngram] += 1
                    else:
                        ngram2count[ngram] = 1
        # filter out all low-occurrence ngrams in this partition
        return [{
            n: c
            for n, c in ngram2count.items()
            if c >= min_ngram_support_per_partition
        }]

    def valid_ngrams(ngram2count: Dict[str, int]) -> Set[Tuple[str]]:
        ngrams = {n for n, c in ngram2count.items() if c >= min_ngram_support}
        return ngrams

    def parse_ngrams(record: Record, ngram_model: Set[Tuple[str]]):
        record[ngram_field] = []
        start_tok_idx = 0
        while start_tok_idx < len(record[token_field]):
            incr = 1  # amount to move start_tok_idx
            # from max -> 2. Match longest
            for ngram_len in range(max_ngram_length, 1, -1):
                # get bounds of ngram and make sure its within sentence
                end_tok_idx = start_tok_idx + ngram_len
                if end_tok_idx > len(record[token_field]):
                    continue
                ngram = tuple(record[token_field][tok_idx]["lemma"]
                              for tok_idx in range(start_tok_idx, end_tok_idx))
                # if match
                if ngram in ngram_model:
                    record[ngram_field].append("_".join(ngram))
                    # skip over matched terms
                    incr = ngram_len
                    break
            start_tok_idx += incr
        return record

    # Begin the actual function
    if max_ngram_length < 1:
        # disable, record empty field for all ngrams
        def init_nothing(rec: Record) -> Record:
            rec[ngram_field] = []
            return rec

        return analyzed_sentences.map(init_nothing)
    else:
        ngram2count = (analyzed_sentences.random_sample(
            ngram_sample_rate).map_partitions(part_to_ngram_counts).fold(
                misc_util.merge_counts, initial={}))
        ngram_model = delayed(valid_ngrams)(ngram2count)
        return analyzed_sentences.map(parse_ngrams, ngram_model=ngram_model)
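A hedged sketch of how the miner might be driven. The token layout (`lemma`, `pos`, `stop`) mirrors the code above, while the POS tags and support thresholds are illustrative guesses; the real `INTERESTING_POS_TAGS` constant and `misc_util.merge_counts` live in the surrounding module.

import dask.bag as dbag

# Assumes "ADJ" and "NOUN" are members of INTERESTING_POS_TAGS.
sentence = {"tokens": [
    {"lemma": "deep", "pos": "ADJ", "stop": False},
    {"lemma": "learning", "pos": "NOUN", "stop": False},
    {"lemma": "is", "pos": "VERB", "stop": True},
    {"lemma": "fun", "pos": "ADJ", "stop": False},
]}
analyzed = dbag.from_sequence([sentence] * 20, npartitions=2)
with_ngrams = get_frequent_ngrams(
    analyzed_sentences=analyzed,
    max_ngram_length=3,
    min_ngram_support=5,
    min_ngram_support_per_partition=2,
    ngram_sample_rate=1.0,
)
# each output record gains an "ngrams" field, e.g. ["deep_learning"]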
Example #25
def train_distributed_knn(
    hash_and_embedding: dbag.Bag,
    batch_size: int,
    num_centroids: int,
    num_probes: int,
    num_quantizers: int,
    bits_per_quantizer: int,
    training_sample_prob: float,
    shared_scratch_dir: Path,
    final_index_path: Path,
    id_field: str = "id",
    embedding_field: str = "embedding",
) -> Path:
    """
  Computing all of the embeddings and then performing a KNN is a problem for memory.
  So, what we need to do instead is compute batches of embeddings, and use them in Faiss
  to reduce their dimensionality and process them appropriately.

  I'm so sorry this one function has to do so much...

  @param hash_and_embedding: bag of hash value and embedding values
  @param id_field: output id field we use to store number hashes
  @param batch_size: number of sentences per batch
  @param num_centroids: number of voronoi cells in approx nn
  @param num_probes: number of cells to consider when querying
  @param num_quantizers: number of sub-vectors to discretize
  @param bits_per_quantizer: bits per sub-vector
  @param shared_scratch_dir: location to store intermediate results.
  @param training_sample_prob: chance a point is trained on
  @return The path from which you can load the resulting FAISS index
  """
    init_index_path = shared_scratch_dir.joinpath("init.index")

    if not init_index_path.is_file():
        print("\t- Constructing initial index:", init_index_path)
        # First off, we need to get a representative sample for faiss training
        training_data = hash_and_embedding.random_sample(
            prob=training_sample_prob).pluck(embedding_field)

        # Train initial index; the result is written to init_index_path, so we
        # do not rebind the name (dask.compute would return a tuple).
        dask.compute(
            dask.delayed(train_initial_index)(
                training_data=training_data,
                num_centroids=num_centroids,
                num_probes=num_probes,
                num_quantizers=num_quantizers,
                bits_per_quantizer=bits_per_quantizer,
                output_path=init_index_path,
            ))
    else:
        print("\t- Using initial index:", init_index_path)

    # For each partition, load embeddings to idx
    partial_idx_paths = []
    for part_idx, part in enumerate(hash_and_embedding.to_delayed()):
        part_path = shared_scratch_dir.joinpath(f"part-{part_idx}.index")
        if part_path.is_file():  # rudimentary ckpt
            partial_idx_paths.append(dask.delayed(part_path))
        else:
            partial_idx_paths.append(
                dask.delayed(add_points_to_index)(
                    records=part,
                    init_index_path=init_index_path,
                    output_path=part_path,
                    batch_size=batch_size,
                ))

    return dask.delayed(merge_index)(
        init_index_path=init_index_path,
        partial_idx_paths=partial_idx_paths,
        final_index_path=final_index_path,
    )
Example #26
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. If left unprovided, it is assumed that only the
        seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube returning the dict of dataset metadata objects.
        The bag has a single partition with a single element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids is None:
        ktk_cube_dataset_ids = [cube.seed_dataset]
    else:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=set(),
            partition_on=prep_partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
Example #27
def bag_to_df(bag: db.Bag, index_key: str = "g_id") -> dd.DataFrame:
    return bag.to_dataframe(
    )  # .set_index(index_key)  # TODO wait for official id
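Usage is straightforward; the index-setting step is still pending upstream, as the inline comment notes:

import dask.bag as db

rows = db.from_sequence([{"g_id": 1, "name": "a"}, {"g_id": 2, "name": "b"}])
bag_to_df(rows).compute()  # pandas DataFrame with columns g_id and name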
Example #28
def perform_document_independent_tasks(
    config: cpb.ConstructConfig,
    documents: dbag.Bag,
    ckpt_prefix: str,
    semrep_work_dir: Optional[Path] = None,
) -> None:
    """Performs Tasks that don't require communication between documents

  Performs all of the document processing operations that are required to
  happen on each document separately. This is important to separate between
  different input textual features because this allows us to update/invalidate
  particular sets of checkpoints faster.

  Args:
    config: Construction Configuration
    documents: Collection of texts to process
    ckpt_prefix: To stop collisions, and to improve caching, each call to this
      function should have a different prefix indicating the type of the
      corresponding documents. For instance, calling this with medline documents
      could get the `medline` prefix.
    semrep_work_dir: The location to store semrep intermediate files. Only
      used if semrep has been installed and configured.

  """

    ckpt("documents", ckpt_prefix)

    # Split documents into sentences, filter out too-long and too-short sentences.
    sentences = documents.map_partitions(
        text_util.split_sentences,
        # --
        min_sentence_len=config.parser.min_sentence_len,
        max_sentence_len=config.parser.max_sentence_len,
    )
    ckpt("sentences", ckpt_prefix)

    # Get metadata terms from each sentence
    coded_term_edges = graph_util.record_to_bipartite_edges(
        records=sentences,
        get_neighbor_keys_fn=text_util.get_mesh_keys,
    )
    ckpt("coded_term_edges", ckpt_prefix, textfile=True)

    # Make edges between each adj sentence
    adj_sent_edges = graph_util.record_to_bipartite_edges(
        records=sentences,
        get_neighbor_keys_fn=text_util.get_adjacent_sentences,
        # We can store only one side of the connection because each sentence will
        # get their own neighbors. Additionally, these should all have the same
        # sort of connections.
        bidirectional=False,
    )
    ckpt("adj_sent_edges", ckpt_prefix, textfile=True)

    # Apply lemmatization and entity extraction to sentences
    parsed_sentences = sentences.map_partitions(
        text_util.analyze_sentences,
        # --
        text_field="sent_text",
    )
    ckpt("parsed_sentences", ckpt_prefix)

    # Get lemma edges
    lemma_edges = graph_util.record_to_bipartite_edges(
        records=parsed_sentences,
        get_neighbor_keys_fn=text_util.get_interesting_token_keys,
    )
    ckpt("lemma_edges", ckpt_prefix, textfile=True)

    # Get entity edges
    entity_edges = graph_util.record_to_bipartite_edges(
        records=parsed_sentences,
        get_neighbor_keys_fn=text_util.get_entity_keys,
    )
    ckpt("entity_edges", ckpt_prefix, textfile=True)

    # If we're running semrep
    if (config.semrep.HasField("semrep_install_dir")
            and config.semrep.HasField("metamap_install_dir")
            and semrep_work_dir is not None):
        prefixed_semrep_work_dir = semrep_work_dir.joinpath(ckpt_prefix)
        prefixed_semrep_work_dir.mkdir(parents=True, exist_ok=True)
        semrep_sentences = \
            semrep_util.extract_entities_and_predicates_from_sentences(
                sentence_records=sentences,
                unicode_to_ascii_jar_path=config.semrep.unicode_to_ascii_jar_path,
                semrep_install_dir=config.semrep.semrep_install_dir,
                work_dir=prefixed_semrep_work_dir,
                lexicon_year=config.semrep.lexicon_year,
                mm_data_year=config.semrep.mm_data_year,
                mm_data_version=config.semrep.mm_data_version,
            )
        ckpt("semrep_sentences", ckpt_prefix)

    # Embed each sentence
    embedded_sentences = (
        sentences.map_partitions(
            embedding_util.embed_records,
            # --
            batch_size=config.sys.batch_size,
            text_field="sent_text",
            max_sequence_length=config.parser.max_sequence_length,
        ))
    ckpt("embedded_sentences", ckpt_prefix)

    # hash each sentence id
    hashed_embeddings = (
        embedded_sentences.map(lambda x: {
            "id": misc_util.hash_str_to_int(x["id"]),
            "embedding": x["embedding"]
        }))
    ckpt("hashed_embeddings", ckpt_prefix)

    hashed_names = (
        sentences.map(lambda rec: {
            "name": rec["id"],
            "hash": misc_util.hash_str_to_int(rec["id"]),
        }))
    ckpt("hashed_names", ckpt_prefix)
Example #29
def record_to_bipartite_edges(
    records:dbag.Bag,
    get_neighbor_keys_fn:Callable[[Record], List[str]],
    weight_by_tf_idf:bool=True,
    minimum_document_frequency:int=2,
    bidirectional:bool=True,
    default_weight_multiplier:float=1.0,
    get_source_key_fn:Callable[[Record], str]=lambda x:x["id"],
)->dbag.Bag:
  """
  This function is responsible for extracting edges from records. For example,
  if you had a bag of records, each containing a set of terms, you might want
  to get the set of edges between records and terms.

  @param records: The collection of records we wish to extract edges from.
  @param get_neighbor_keys_fn: Given a record, return a list of graph keys that
  are adjacent to the given record
  @param weight_by_tf_idf: If true, perform tf-idf weighting on edges. In this
  case, if t is a term, d is a document and C is a corpus, than we calculate
  1/((times t occurs in d / log(size of d)) * (size of C / number of d with t))
  @param minimum_document_frequency: only used if weight_by_tf_idf is true.
  Removes nodes among neighbors that don't occur frequently enough.
  @param bidirectional: If true, we write record->neighbor and neighbor->record.
  If false, we only write record->neighbor.
  @param default_weight_multiplier: All weights are multiplied by this. If we
  aren't calculating tf-idf, this is the value of every weight.
  @param get_source_key_fn: Given a record, return a graph key that uniquely
  identifies the root. By default we get the "id" field
  @return A collection of networkx subgraphs
  """

  def to_id_term_freq_len(records):
    res = []
    for record in records:
      id_ = get_source_key_fn(record)
      tfs = {}
      neighs = get_neighbor_keys_fn(record)
      for n in neighs:
        if n in tfs:
          tfs[n] += 1
        else:
          tfs[n] = 1
      res += [
          (id_, term, freq, len(neighs))
          for term, freq in tfs.items()
      ]
    # columns=id, term, freq, doc_len
    return res

  def to_partial_doc_freqs(records):
    t2df = {}
    for record in records:
      for t in set(get_neighbor_keys_fn(record)):
        if t in t2df:
          t2df[t] += 1
        else:
          t2df[t] = 1
    # columns=term, doc_freq
    return list(t2df.items())

  def calculate_tf_idf_part(part, corpus_size):
    res = []
    for row in part.itertuples():
      tfidf = 1.0 / ((
          row.freq / log(row.doc_len+1)
            ) * (
          log(float(corpus_size) / row.doc_freq)
      ))
      res.append([row.id, row.term, tfidf])
    return pd.DataFrame(res, columns=["id", "term", "freq"])

  def part_to_graph(id_term_freqs):
    graph = nx.Graph()
    for row in id_term_freqs:
      i, t, f = row[:3]
      f *= default_weight_multiplier
      graph.add_edge(i, t, weight=f)
      if bidirectional:
        graph.add_edge(t, i, weight=f)
    return [graph]

  # a list of (id, term, freq)
  term_df = (
      records
      .map_partitions(to_id_term_freq_len)
      .to_dataframe(
        meta={
          "id": str,
          "term": str,
          "freq": float,
          "doc_len": int,
        }
      )
  )
  if weight_by_tf_idf:
    # A list of (term, doc_freq)
    document_frequencies = (
        records
        .map_partitions(to_partial_doc_freqs)
        .to_dataframe(
          meta={
            "term": str,
            "doc_freq": int,
          }
        )
        .groupby("term")
        .sum()
    )
    # filter
    document_frequencies = document_frequencies[
        document_frequencies["doc_freq"] >= minimum_document_frequency
    ]

    corpus_size = records.count()
    term_df = (
        term_df
        .join(document_frequencies, how="inner", on="term")
        .map_partitions(
          calculate_tf_idf_part,
          corpus_size=corpus_size,
          meta={
            "id": str,
            "term": str,
            "freq": float,
          }
        )
    )

  return term_df.to_bag().map_partitions(part_to_graph)