def create_lookup_table(
    record_bag: dbag.Bag,
    key_field: str,
    value_field: str,
    database_path: Path,
    intermediate_data_dir: Path,
    agatha_install_path: Path,
) -> None:
    database_path = Path(database_path)
    intermediate_data_dir = Path(intermediate_data_dir)
    agatha_install_path = Path(agatha_install_path)

    if not database_path.is_file():
        if not intermediate_data_dir.exists():
            intermediate_data_dir.mkdir(parents=True, exist_ok=True)
        else:
            # Remove any previously constructed json files in there
            print("\t- Removing existing json files from", intermediate_data_dir)
            for json_file in intermediate_data_dir.glob("*.json"):
                json_file.unlink()
        print("\t- Writing intermediate json files")
        (
            # Save all keys and values as kv pair json files
            record_bag
            .map(_record_to_kv_json, key_field=key_field, value_field=value_field)
            .to_textfiles(f"{intermediate_data_dir}/*.json")
        )
        print("\t- Writing", database_path)
        _make_sqlite3_database_from_json(
            intermediate_data_dir=intermediate_data_dir,
            database_path=database_path,
            agatha_install_path=agatha_install_path,
        )
def fetch_page_ids(
    bucket_name: str = S3_CANONICAL_DATA_BUCKET,
    source: str = "issues",
    issue_bag: db.Bag = None,
    n_partitions: int = 100,
) -> db.Bag:
    valid_sources = ["issues", "pages"]
    assert source in valid_sources

    if issue_bag is None:
        issue_bag = fetch_issues(bucket_name, compute=False).filter(
            lambda i: len(i) > 0
        )

    if source == "issues":
        print(f"Fetching page IDs from {source}")
        # no need to recompute the issues: `issue_bag` was either passed in or
        # fetched above
        return issue_bag.map(lambda i: i["pp"]).flatten()
    else:
        page_files = list_pages(bucket_name)
        return (
            db.from_sequence(page_files, npartitions=n_partitions)
            .map(alternative_read_text, IMPRESSO_STORAGEOPT)
            .flatten()
            .map(json.loads)
            .filter(lambda i: len(i) > 0)
            .pluck("id")
        )
def check_duplicated_content_item_IDs(issue_bag: bag.Bag) -> pd.DataFrame:
    """Check that content item IDs are unique within the corpus.

    .. note::
        This is a global check.

    :param bag.Bag issue_bag: Bag of newspaper issues in their JSON representation.
    :return: DataFrame of duplicated content item IDs, with their frequency and newspaper ID.
    :rtype: pd.DataFrame
    """
    duplicates = (
        issue_bag.map(lambda issue_json: [ci["m"]["id"] for ci in issue_json["i"]])
        .flatten()
        .frequencies()
        .filter(lambda i: i[1] > 1)
        .map(
            lambda i: {
                "ci_id": i[0],
                "freq": i[1],
                "newspaper_id": i[0].split("-")[0],
            }
        )
        .compute()
    )

    if duplicates:
        duplicates_df = pd.DataFrame(duplicates).set_index("ci_id")
    else:
        # there are no duplicates
        duplicates_df = pd.DataFrame(columns=["ci_id", "freq", "newspaper_id"])

    print(
        f"Found {duplicates_df.shape[0]} duplicated content item IDs, "
        f"belonging to {duplicates_df.newspaper_id.unique().size} journals "
        f"({', '.join(list(duplicates_df.newspaper_id.unique()))})"
    )
    return duplicates_df
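# Usage sketch for the duplicate check above, on toy issue records in the
# minimal shape the function reads (content items under "i", each with its ID
# at ["m"]["id"]); the IDs are made up.
import dask.bag as db

toy_issues = db.from_sequence([
    {"i": [{"m": {"id": "GDL-1900-01-01-a-i0001"}},
           {"m": {"id": "GDL-1900-01-01-a-i0001"}}]},
    {"i": [{"m": {"id": "JDG-1900-01-02-a-i0003"}}]},
])
dupes_df = check_duplicated_content_item_IDs(toy_issues)
# -> one row: ci_id GDL-1900-01-01-a-i0001, freq 2, newspaper_id "GDL"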
def export_key_value_records(
    key_value_records: dbag.Bag,
    export_dir: Path,
) -> None:
    """Converts a Dask bag of Dicts into a collection of json files.

    In order to create a lookup table, we must first export all data as json.
    This function maps each element of the input bag to a json encoded string
    and writes one file per partition to the export_dir.

    WARNING: this function will delete any json files already present in
    export_dir.

    Args:
      key_value_records: A dask bag containing dicts.
      export_dir: The location to write json files. Will erase any if present
        beforehand.
    """
    export_dir = Path(export_dir)
    # Clean up / setup export dir
    export_dir.mkdir(parents=True, exist_ok=True)
    # Remove any previously constructed json files in there
    for json_file in export_dir.glob("*.json"):
        json_file.unlink()
    (
        key_value_records
        .map(_record_to_kv_json)
        .to_textfiles(f"{export_dir}/*.json")
    )
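# A minimal sketch of the dask idiom export_key_value_records relies on: one
# json line per record, one output file per partition. Here json.dumps stands
# in for _record_to_kv_json and "kv_export" is a hypothetical directory.
import json
from pathlib import Path
import dask.bag as db

export_dir = Path("kv_export")
export_dir.mkdir(parents=True, exist_ok=True)
records = db.from_sequence(
    [{"key": "s:abc", "value": 1}, {"key": "s:def", "value": 2}],
    npartitions=2,
)
# writes kv_export/0.json and kv_export/1.json
records.map(json.dumps).to_textfiles(f"{export_dir}/*.json")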
def put_bag(
    bag: dbag.Bag,
    collection: str,
    indexed_field_name: Optional[str] = None,
    index_type: MONGO_INDEX = pymongo.HASHED,
) -> dbag.Bag:
    """
    Writes all the records to collection. Sets index if specified. Returns a
    bag containing a single flag per written partition, intended for use in
    the checkpointing system.
    """

    def put_part_wrapper(*args, **kwargs):
        put(*args, **kwargs)
        return [True]

    if indexed_field_name is not None:
        print(f"\t- Setting index: {collection}.{indexed_field_name}:{index_type}")
        set_index(
            collection=collection,
            field_name=indexed_field_name,
            index_type=index_type,
        )
    return dbag.from_delayed([
        dask.delayed(put_part_wrapper)(
            records=part,
            collection=collection,
        )
        for part in bag.to_delayed()
    ])
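# Sketch of the to_delayed / from_delayed pattern put_bag builds on, without a
# MongoDB dependency: `write_partition` below is a hypothetical stand-in for
# `put`, and the wrapper returns one flag per partition so the result can act
# as a cheap checkpoint value.
import dask
import dask.bag as dbag

def write_partition(records):
    print(f"writing {len(records)} records")  # side effect goes here

def _wrapper(records):
    write_partition(records)
    return [True]

toy_bag = dbag.from_sequence(range(10), npartitions=3)
flags = dbag.from_delayed(
    [dask.delayed(_wrapper)(part) for part in toy_bag.to_delayed()]
)
print(flags.compute())  # [True, True, True]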
def _multiplex_store(
    data: Dict[str, Any],
    cube: Cube,
    store: StoreFactory,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Dict[str, MetaPartition]:
    # Applied to each element of the bag: a dict mapping ktk_cube dataset id
    # to the prepared data for that dataset.
    result = {}
    for k in sorted(data.keys()):
        v = data.pop(k)
        result[k] = MetaPartition.store_dataframes(
            v,
            dataset_uuid=cube.ktk_dataset_uuid(k),
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del v
    return result
def remove_deleted_tweets(data: db.Bag) -> db.Bag:
    """
    Function to remove unneeded tweets.

    Deleted tweets don't include various parameters, including the `lang`
    parameter.

    :param data: dask bag that contains the tweets
    :return: returns the items that haven't been deleted
    """
    return data.filter(lambda x: 'lang' in x)
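# Quick usage sketch with made-up records: only items carrying a `lang` field
# survive the filter.
import dask.bag as db

toy_tweets = db.from_sequence([
    {"id": 1, "lang": "en", "text": "hello"},
    {"delete": {"status": {"id": 2}}},  # deleted tweets lack `lang`
])
print(remove_deleted_tweets(toy_tweets).compute())  # keeps only the first record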
def nearest_neighbors_network_from_index(
    hash_and_embedding: dbag.Bag,
    inverted_index_collection: str,
    batch_size: int,
    num_neighbors: int,
    faiss_index_name="final",
    weight: float = 1.0,
) -> Iterable[nx.Graph]:
    """
    Applies faiss and runs results through inverted index.

    Requires knn_util:faiss_index and knn_util:inverted_index to be
    initialized.
    """

    def apply_faiss_to_edges(
        hash_and_embedding: Iterable[Record],
    ) -> Iterable[nx.Graph]:
        # The only reason we need parts_written_to_db is to make sure that the
        # writing happens before this point
        index = dpg.get(f"knn_util:faiss_{faiss_index_name}")

        inverted_index = {}
        graph = nx.Graph()
        for batch in iter_to_batches(hash_and_embedding, batch_size):
            root_hashes, embeddings = records_to_ids_and_embeddings(
                records=batch,
            )
            _, neighs_per_root = index.search(embeddings, num_neighbors)
            root_hashes = root_hashes.tolist()
            # Only look up hashes we have not resolved yet; keep the ordered
            # root list separate so roots stay aligned with their neighbors.
            new_hashes = list(
                set(root_hashes + flatten_list(neighs_per_root.tolist()))
                - set(inverted_index.keys())
            )
            graph_keys = database_util.get(
                values=new_hashes,
                collection=inverted_index_collection,
                field_name="hash",
                desired_fields=["strid"],
            )
            for k, v in zip(new_hashes, graph_keys):
                inverted_index[k] = v["strid"]
            # Create records
            for root_idx, neigh_indices in zip(root_hashes, neighs_per_root):
                root = inverted_index[root_idx]
                if root is None:
                    continue
                for neigh_idx in neigh_indices:
                    if neigh_idx == root_idx:
                        continue
                    neigh = inverted_index[neigh_idx]
                    if neigh is None:
                        continue
                    graph.add_edge(root, neigh, weight=weight)
                    graph.add_edge(neigh, root, weight=weight)
        return [graph]

    return hash_and_embedding.map_partitions(apply_faiss_to_edges)
def _run_apply(algorithms: Iterable[Algorithm], dataset: Bag) -> Tuple:
    algorithms = list(algorithms)

    def fold(*args):
        return tuple(left + right for left, right in zip(*args))

    reduced = (
        dataset
        .map(lambda data: tuple(alg.apply(data) for alg in algorithms))
        .fold(fold)
        .compute()
    )
    assert len(reduced) == len(algorithms)
    return tuple(reduced)
def _(data: Bag, fr: float, to: float, bins: int) -> Tuple[ndarray, ndarray]:
    # @jit(nopython=True, nogil=True)  # todo: jit this function
    def inc(values: Iterable[float]) -> ndarray:
        binned = digitize(values, linspace(fr, to, bins + 1))
        init = zeros(bins + 2, dtype=uint64)
        for i in binned:
            init[i] += 1
        return init

    hist = data.reduction(inc, sum)
    return hist, linspace(fr, to, bins + 1)
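# The per-partition / aggregate reduction used above can be exercised on its
# own; a minimal sketch with toy values (bins 0 and bins+1 catch under/overflow).
import dask.bag as db
from numpy import digitize, linspace, uint64, zeros

fr, to, bins = 0.0, 1.0, 4

def partial_hist(values):
    binned = digitize(list(values), linspace(fr, to, bins + 1))
    counts = zeros(bins + 2, dtype=uint64)
    for i in binned:
        counts[i] += 1
    return counts

samples = db.from_sequence([0.1, 0.2, 0.55, 0.8, 1.5], npartitions=2)
# partial histograms are summed element-wise across partitions
print(samples.reduction(partial_hist, sum).compute())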
def save(bag: dbag.Bag, path: Path, keep_partial_result: bool = False) -> dask.delayed:
    path.mkdir(parents=True, exist_ok=True)
    save_tasks = []
    for part_idx, part in enumerate(bag.to_delayed()):
        part_path = path.joinpath(f"part-{part_idx}{EXT}")
        # if the partial result is not present, or we're not keeping partials
        if not part_path.is_file() or not keep_partial_result:
            save_tasks.append(dask.delayed(save_part)(part, part_path))
        else:
            # introduces a no-op that keeps __done__ file correct
            save_tasks.append(dask.delayed(part_path))
    return dask.delayed(write_done_file)(save_tasks, path)
def check_duplicated_issues_IDs(issue_bag: bag.Bag) -> pd.DataFrame:
    """Check that newspaper issue IDs are unique within the corpus."""
    duplicate_issue_ids = (
        issue_bag.pluck("id")
        .frequencies()
        .filter(lambda i: i[1] > 1)
        .map(
            lambda i: {
                "issue_id": i[0],
                "freq": i[1],
                "newspaper_id": i[0].split("-")[0],
            }
        )
        .compute()
    )
    print(f"{len(duplicate_issue_ids)} duplicated IDs were found")
    if not duplicate_issue_ids:
        # no duplicates: return an empty frame with the expected columns
        return pd.DataFrame(
            columns=["issue_id", "freq", "newspaper_id"]
        ).set_index("issue_id")
    return pd.DataFrame(duplicate_issue_ids).set_index("issue_id")
def _(
    data: Bag, xfr: float, xto: float, xbins: int, yfr: float, yto: float, ybins: int
) -> Tuple[ndarray, ndarray, ndarray]:
    # @jit(nopython=True, nogil=True)  # todo: jit this function
    def inc(values: Iterable[Tuple[float, float]]) -> ndarray:
        xvalues, yvalues = column_stack(values)
        xbinned = digitize(xvalues, linspace(xfr, xto, xbins + 1))
        ybinned = digitize(yvalues, linspace(yfr, yto, ybins + 1))
        init = zeros((xbins + 2, ybins + 2), dtype=uint64)
        for x, y in zip(xbinned, ybinned):
            init[x, y] += 1
        return init

    hist = data.reduction(inc, sum)
    return hist, linspace(xfr, xto, xbins + 1), linspace(yfr, yto, ybins + 1)
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified and modified version of kartothek's
    ``store_bag_as_dataset``, which cannot be easily used to store datasets in
    parallel (e.g. from a dict).

    ``delete_scopes`` is a dictionary mapping the kartothek dataset id to the
    ``delete_scope`` of the dataset (see ``update_dataset_from_partitions`` for
    the definition of the single dataset ``delete_scope``).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id), store=store
            )

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and
    # `MetaPartition.build_indices`, so this is not required here anymore
    mps = mps.map(
        _multiplex_store, store=store, cube=cube, df_serializer=df_serializer
    )

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(
        perpartition=list, aggregate=aggregate, split_every=False, out_type=db.Bag
    )
def to_training_database(bag: dbag.Bag, database_dir: Path):
    assert database_dir.is_dir()
    done_file = database_dir.joinpath("__done__")
    if not done_file.is_file():

        def part_to_db(records):
            r_name = "".join(
                random.choice(string.ascii_letters) for _ in range(10)
            )
            db_path = database_dir.joinpath(r_name + ".sqlite")
            with SqliteDict(db_path, journal_mode="OFF", flag="n") as db:
                for idx, rec in enumerate(records):
                    db[str(idx)] = rec
                db.commit()
            # return a single-element partition so map_partitions yields one
            # database path per partition
            return [db_path]

        db_paths = bag.map_partitions(part_to_db).compute()
        with open(done_file, 'w') as f:
            for p in db_paths:
                f.write(f"{p}\n")
def extract_entities_and_predicates_from_sentences(
    sentence_records: dbag.Bag,
    semrep_install_dir: Path,
    unicode_to_ascii_jar_path: Path,
    work_dir: Path,
    lexicon_year: int,
    mm_data_year: str,
    mm_data_version: str,
) -> dbag.Bag:
    """Runs each sentence through SemRep. Identifies Predicates and Entities

    Requires get_metamap_server_initializer added to dask_process_global.

    Args:
      sentence_records: Each record needs `id` and `sent_text`.
      work_dir: A directory visible to all workers where SemRep intermediate
        files will be stored.
      semrep_install_dir: The path where semrep was installed.

    Returns:
      One record per input sentence, where `id` of the new record matches the
      input. However, returned records will only have `entities` and
      `predicates`.
    """
    work_dir = Path(work_dir)
    assert work_dir.is_dir(), f"Failed to find shared work_dir: {work_dir}"
    semrep_input_dir = work_dir.joinpath("input_files")
    semrep_output_dir = work_dir.joinpath("output_files")
    semrep_input_dir.mkdir(exist_ok=True, parents=True)
    semrep_output_dir.mkdir(exist_ok=True, parents=True)

    semrep_tasks = []
    for part_idx, partition in enumerate(sentence_records.to_delayed()):
        semrep_input_path = semrep_input_dir.joinpath(f"input_{part_idx}.txt")
        # semrep_output_path = semrep_output_dir.joinpath(f"ouput_{part_idx}.xml")
        semrep_tasks.append(
            dask.delayed(_sentence_partition_to_records)(
                records=partition,
                unicode_to_ascii_jar_path=unicode_to_ascii_jar_path,
                input_path=semrep_input_path,
                semrep_install_dir=semrep_install_dir,
                lexicon_year=lexicon_year,
                mm_data_year=mm_data_year,
                mm_data_version=mm_data_version,
            )
        )
    return dbag.from_delayed(semrep_tasks)
def record_to_bipartite_edges(
    records: dbag.Bag,
    get_neighbor_keys_fn: Callable[[Record], Iterable[str]],
    get_source_key_fn: Callable[[Record], str] = lambda x: x["id"],
    bidirectional: bool = True,
) -> dbag.Bag:
    """
    This function is responsible for extracting edges from records. For
    example, if you had a bag of records, each containing a set of terms, you
    might want to get the set of edges between records and terms.

    Args:
      records: The collection of records we wish to extract edges from.
      get_neighbor_keys_fn: Given a record, return a list of graph keys that
        are adjacent to the given record
      get_source_key_fn: Given a record, return a graph key that uniquely
        identifies the root. By default we get the "id" field
      bidirectional: If true, we write record->neighbor and neighbor->record.
        If false, we only write record->neighbor.

    Returns:
      A bag containing serialized key-value pairs that can be used to create an
      Sqlite3LookupTable
    """

    def _to_kv(recs: Iterable[Record]) -> List[str]:
        "id, neighs to key_value strings"
        # Create graph, remove duplicate edges
        graph = defaultdict(set)
        for r in recs:
            id_ = r["id"]
            for neigh in r["neighs"]:
                graph[id_].add(neigh)
                if bidirectional:
                    graph[neigh].add(id_)
        # Output edges
        res = []
        for source, targets in graph.items():
            for target in targets:
                res.append(json.dumps(dict(key=source, value=target)))
        return res

    return (
        records.map(
            lambda r: {
                "id": get_source_key_fn(r),
                "neighs": get_neighbor_keys_fn(r),
            }
        )
        .map_partitions(_to_kv)
    )
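# Usage sketch for the function above; the record shapes and keys are made up.
# Each emitted string is a json {"key": ..., "value": ...} pair ready for an
# Sqlite3LookupTable.
import dask.bag as dbag

toy_records = dbag.from_sequence([
    {"id": "s:doc1", "terms": ["l:cat", "l:dog"]},
    {"id": "s:doc2", "terms": ["l:dog"]},
])
edge_kv_json = record_to_bipartite_edges(
    toy_records,
    get_neighbor_keys_fn=lambda r: r["terms"],
)
print(edge_kv_json.compute())
# e.g. '{"key": "s:doc1", "value": "l:dog"}', '{"key": "l:dog", "value": "s:doc1"}', ...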
def nearest_neighbors_network_from_index(
    hash_and_embedding: dbag.Bag,
    hash2name_db: Path,
    batch_size: int,
    num_neighbors: int,
    faiss_index_name="final",
    weight: float = 1.0,
) -> Iterable[nx.Graph]:
    """
    Applies faiss and runs results through inverted index.
    """
    assert hash2name_db.is_file(), "Missing hash2names sqlite3 db."

    def apply_faiss_to_edges(
        hash_and_embedding: Iterable[Record],
    ) -> Iterable[nx.Graph]:
        # The only reason we need parts_written_to_db is to make sure that the
        # writing happens before this point
        index = dpg.get(f"knn_util:faiss_{faiss_index_name}")

        graph = nx.Graph()
        with sqlite3_lookup.Sqlite3LookupTable(hash2name_db) as hash2names:
            for batch in iter_to_batches(hash_and_embedding, batch_size):
                hashes, embeddings = to_hash_and_embedding(records=batch)
                _, neighs_per_root = index.search(embeddings, num_neighbors)
                hashes = hashes.tolist() + flatten_list(neighs_per_root.tolist())
                # Create records
                for root_hash, neigh_indices in zip(hashes, neighs_per_root):
                    if root_hash in hash2names:
                        for root_name in hash2names[root_hash]:
                            for neigh_hash in neigh_indices:
                                if (neigh_hash != root_hash
                                        and neigh_hash in hash2names):
                                    for neigh_name in hash2names[neigh_hash]:
                                        graph.add_edge(
                                            root_name, neigh_name, weight=weight
                                        )
                                        graph.add_edge(
                                            neigh_name, root_name, weight=weight
                                        )
        return [graph]

    return hash_and_embedding.map_partitions(apply_faiss_to_edges)
def nearest_neighbors_network_from_index(
    hash_and_embedding: dbag.Bag,
    hash2name_db: Path,
    batch_size: int,
    num_neighbors: int,
    faiss_index_name="final",
    weight: float = 1.0,
) -> Iterable[str]:
    """
    Applies faiss and runs results through inverted index.
    """
    assert hash2name_db.is_file(), "Missing hash2names sqlite3 db."

    def _apply_faiss(hash_and_embedding: Iterable[Record]) -> List[Record]:
        # The only reason we need parts_written_to_db is to make sure that the
        # writing happens before this point
        index = dpg.get(f"knn_util:faiss_{faiss_index_name}")
        hash2names = sqlite3_lookup.Sqlite3LookupTable(hash2name_db)

        # "id", "neighs"
        res = []
        for batch in iter_to_batches(hash_and_embedding, batch_size):
            hashes, embeddings = to_hash_and_embedding(records=batch)
            _, neighs_per_root = index.search(embeddings, num_neighbors)
            hashes = hashes.tolist() + flatten_list(neighs_per_root.tolist())
            # Create records
            for root_hash, neigh_indices in zip(hashes, neighs_per_root):
                if root_hash in hash2names:
                    for root_name in hash2names[root_hash]:
                        val = {"id": root_name, "neighs": set()}
                        for neigh_hash in neigh_indices:
                            if (neigh_hash != root_hash
                                    and neigh_hash in hash2names):
                                for neigh_name in hash2names[neigh_hash]:
                                    val["neighs"].add(neigh_name)
                        res.append(val)
        return res

    return graph_util.record_to_bipartite_edges(
        hash_and_embedding.map_partitions(_apply_faiss),
        bidirectional=True,
        get_neighbor_keys_fn=lambda r: r["neighs"],
    )
def main():
    args = parse_args()
    if args.dir and (args.fwd or args.rev):
        sys.exit("Cannot define both dir and file options. Please choose one")
    elif args.dir:
        files = glob(args.dir + "/*R1*.gz")
        files.sort()
        files = [(f, f.replace("_R1", "_R2")) for f in files]
    else:
        fwd_file = args.fwd
        rev_file = args.rev
        files = [(fwd_file, rev_file)]

    if args.parallel:
        import dask.bag as db
        # each bag element is a (forward, reverse) pair; Bag.map passes the
        # tuple as a single argument
        b = db.from_sequence(files)
        b.map(lambda pair: process_gRNA(pair[0], pair[1], args.wtlib)).compute()
    else:
        print(files)
        for f, r in files:
            process_gRNA(f, r, args.wtlib)
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::
        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated,
        the old data is treated as "removed".

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    # existing_payload is set to empty because we're not checking against any
    # existing payload. ktk will account for the compat check within 1 dataset
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids, existing_datasets
        )
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(_fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that extends a cube by the data supplied from a dask bag.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see :func:`~kartothek.io.eager_cube.build_cube` for possible format and types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to extend a cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    check_datasets_preextend(ktk_cube_dataset_ids, cube)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in ktk_cube_dataset_ids
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=prep_partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
from operator import methodcaller

from dask.bag import Bag
from odo import TextFile, chunks, odo
from odo.utils import filetexts


def inc(x):
    return x + 1


dsk = {('x', 0): (range, 5), ('x', 1): (range, 5), ('x', 2): (range, 5)}
L = list(range(5)) * 3
b = Bag(dsk, 'x', 3)


def test_convert_bag_to_list():
    assert odo(b, list) == L


def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert (list(map(methodcaller('strip'), odo(b, list))) ==
                list(map(methodcaller('strip'), odo(logs, list))))
def get_frequent_ngrams(
    analyzed_sentences: dbag.Bag,
    max_ngram_length: int,
    min_ngram_support: int,
    min_ngram_support_per_partition: int,
    ngram_sample_rate: float,
    token_field: str = "tokens",
    ngram_field: str = "ngrams",
) -> dbag.Bag:
    """
    Adds a new field containing a list of all mined n-grams. N-grams are tuples
    of strings such that at least one string is not a stopword. Strings are
    collected from the lemmas of sentences. To be counted, an ngram must occur
    in at least `min_ngram_support` sentences.
    """

    def part_to_ngram_counts(
        records: Iterable[Record],
    ) -> Iterable[Dict[Tuple[str], int]]:
        ngram2count = {}
        for rec in records:

            def interesting(idx):
                t = rec[token_field][idx]
                return not t["stop"] and t["pos"] in INTERESTING_POS_TAGS

            # beginning of ngram
            for start_tok_idx in range(len(rec[token_field])):
                # ngrams must begin with an interesting word
                if not interesting(start_tok_idx):
                    continue
                # for each potential n-gram size
                for ngram_len in range(2, max_ngram_length):
                    end_tok_idx = start_tok_idx + ngram_len
                    # ngrams cannot extend beyond the sentence
                    if end_tok_idx > len(rec[token_field]):
                        continue
                    # ngrams must end with an interesting word
                    if not interesting(end_tok_idx - 1):
                        continue
                    # the ngram is an ordered tuple of lemmas
                    ngram = tuple(
                        rec[token_field][tok_idx]["lemma"]
                        for tok_idx in range(start_tok_idx, end_tok_idx)
                    )
                    if ngram in ngram2count:
                        ngram2count[ngram] += 1
                    else:
                        ngram2count[ngram] = 1
        # filter out all low-occurrence ngrams in this partition
        return [{
            n: c
            for n, c in ngram2count.items()
            if c >= min_ngram_support_per_partition
        }]

    def valid_ngrams(ngram2count: Dict[str, int]) -> Set[Tuple[str]]:
        ngrams = {n for n, c in ngram2count.items() if c >= min_ngram_support}
        return ngrams

    def parse_ngrams(record: Record, ngram_model: Set[Tuple[str]]):
        record[ngram_field] = []
        start_tok_idx = 0
        while start_tok_idx < len(record[token_field]):
            incr = 1  # amount to move start_tok_idx
            # from max -> 2. Match longest
            for ngram_len in range(max_ngram_length, 1, -1):
                # get bounds of ngram and make sure its within sentence
                end_tok_idx = start_tok_idx + ngram_len
                if end_tok_idx > len(record[token_field]):
                    continue
                ngram = tuple(
                    record[token_field][tok_idx]["lemma"]
                    for tok_idx in range(start_tok_idx, end_tok_idx)
                )
                # if match
                if ngram in ngram_model:
                    record[ngram_field].append("_".join(ngram))
                    # skip over matched terms
                    incr = ngram_len
                    break
            start_tok_idx += incr
        return record

    # Begin the actual function
    if max_ngram_length < 1:
        # disable, record empty field for all ngrams
        def init_nothing(rec: Record) -> Record:
            rec[ngram_field] = []
            return rec

        return analyzed_sentences.map(init_nothing)
    else:
        ngram2count = (
            analyzed_sentences
            .random_sample(ngram_sample_rate)
            .map_partitions(part_to_ngram_counts)
            .fold(misc_util.merge_counts, initial={})
        )
        ngram_model = delayed(valid_ngrams)(ngram2count)
        return analyzed_sentences.map(parse_ngrams, ngram_model=ngram_model)
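# Usage sketch on toy analyzed sentences (assumes the surrounding module's
# INTERESTING_POS_TAGS includes "NOUN" and that misc_util.merge_counts is
# available). Each token carries the lemma / pos / stop fields the miner reads.
import dask.bag as dbag

toy_sentences = dbag.from_sequence(
    [{
        "tokens": [
            {"lemma": "gene", "pos": "NOUN", "stop": False},
            {"lemma": "expression", "pos": "NOUN", "stop": False},
            {"lemma": "is", "pos": "VERB", "stop": True},
            {"lemma": "noisy", "pos": "ADJ", "stop": False},
        ]
    }] * 5,
    npartitions=1,
)
with_ngrams = get_frequent_ngrams(
    analyzed_sentences=toy_sentences,
    max_ngram_length=3,
    min_ngram_support=2,
    min_ngram_support_per_partition=1,
    ngram_sample_rate=1.0,
)
# each record gains an `ngrams` field, e.g. ["gene_expression"]
print(with_ngrams.compute()[0]["ngrams"])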
def train_distributed_knn(
    hash_and_embedding: dbag.Bag,
    batch_size: int,
    num_centroids: int,
    num_probes: int,
    num_quantizers: int,
    bits_per_quantizer: int,
    training_sample_prob: float,
    shared_scratch_dir: Path,
    final_index_path: Path,
    id_field: str = "id",
    embedding_field: str = "embedding",
) -> Path:
    """
    Computing all of the embeddings and then performing a KNN is a problem for
    memory. So, what we need to do instead is compute batches of embeddings,
    and use them in Faiss to reduce their dimensionality and process them
    appropriately.

    I'm so sorry this one function has to do so much...

    @param hash_and_embedding: bag of hash value and embedding values
    @param id_field: field holding each record's numeric hash id
    @param embedding_field: field holding each record's embedding
    @param batch_size: number of sentences per batch
    @param num_centroids: number of voronoi cells in approx nn
    @param num_probes: number of cells to consider when querying
    @param num_quantizers: number of sub-vectors to discretize
    @param bits_per_quantizer: bits per sub-vector
    @param shared_scratch_dir: location to store intermediate results.
    @param training_sample_prob: chance a point is trained on
    @return The path you can load the resulting FAISS index
    """
    init_index_path = shared_scratch_dir.joinpath("init.index")

    if not init_index_path.is_file():
        print("\t- Constructing initial index:", init_index_path)
        # First off, we need to get a representative sample for faiss training
        training_data = hash_and_embedding.random_sample(
            prob=training_sample_prob
        ).pluck(embedding_field)

        # Train initial index, store result in init_index_path.
        # dask.compute returns a tuple, so unpack the single result.
        init_index_path = dask.compute(
            dask.delayed(train_initial_index)(
                training_data=training_data,
                num_centroids=num_centroids,
                num_probes=num_probes,
                num_quantizers=num_quantizers,
                bits_per_quantizer=bits_per_quantizer,
                output_path=init_index_path,
            )
        )[0]
    else:
        print("\t- Using initial index:", init_index_path)

    # For each partition, load embeddings to idx
    partial_idx_paths = []
    for part_idx, part in enumerate(hash_and_embedding.to_delayed()):
        part_path = shared_scratch_dir.joinpath(f"part-{part_idx}.index")
        if part_path.is_file():  # rudimentary ckpt
            partial_idx_paths.append(dask.delayed(part_path))
        else:
            partial_idx_paths.append(
                dask.delayed(add_points_to_index)(
                    records=part,
                    init_index_path=init_index_path,
                    output_path=part_path,
                    batch_size=batch_size,
                )
            )

    return dask.delayed(merge_index)(
        init_index_path=init_index_path,
        partial_idx_paths=partial_idx_paths,
        final_index_path=final_index_path,
    )
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. If left unprovided, it is assumed that only the
        seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    if ktk_cube_dataset_ids is None:
        ktk_cube_dataset_ids = [cube.seed_dataset]
    else:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=set(),
            partition_on=prep_partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
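# Rough end-to-end sketch using kartothek's documented public wrapper around
# this internal function (build_cube_from_bag). Treat the store setup and the
# exact import paths as assumptions; the seed dataframe is a toy example.
from functools import partial
from tempfile import mkdtemp

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url
from kartothek.core.cube.cube import Cube
from kartothek.io.dask.bag_cube import build_cube_from_bag

cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="demo_cube",
)
store_factory = partial(get_store_from_url, f"hfs://{mkdtemp()}")

# one {dataset-id: dataframe} dict per bag element
seed_data = db.from_sequence(
    [{"seed": pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10, 20]})}],
    npartitions=1,
)
build_cube_from_bag(
    data=seed_data,
    cube=cube,
    store=store_factory,
    ktk_cube_dataset_ids=["seed"],
).compute()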
def bag_to_df(bag: db.Bag, index_key: str = "g_id") -> dd.DataFrame:
    return bag.to_dataframe()  # .set_index(index_key)  # TODO wait for official id
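# Minimal sketch: a bag of homogeneous dicts converts straight into a dask
# DataFrame with columns inferred from the records.
import dask.bag as db

toy_bag = db.from_sequence([{"g_id": 1, "name": "a"}, {"g_id": 2, "name": "b"}])
print(toy_bag.to_dataframe().compute())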
def perform_document_independent_tasks(
    config: cpb.ConstructConfig,
    documents: dbag.Bag,
    ckpt_prefix: str,
    semrep_work_dir: Optional[Path] = None,
) -> None:
    """Performs tasks that don't require communication between documents

    Performs all of the document processing operations that are required to
    happen on each document separately. This is important to separate between
    different input textual features because this allows us to
    update/invalidate particular sets of checkpoints faster.

    Args:
      config: Construction Configuration
      documents: Collection of texts to process
      ckpt_prefix: To stop collisions, and to improve caching, each call to
        this function should have a different prefix indicating the type of
        the corresponding documents. For instance, calling this with medline
        documents could get the `medline` prefix.
      semrep_work_dir: The location to store semrep intermediate files. Only
        used if semrep has been installed and configured.
    """
    ckpt("documents", ckpt_prefix)

    # Split documents into sentences, filter out too-long and too-short
    # sentences.
    sentences = documents.map_partitions(
        text_util.split_sentences,  # --
        min_sentence_len=config.parser.min_sentence_len,
        max_sentence_len=config.parser.max_sentence_len,
    )
    ckpt("sentences", ckpt_prefix)

    # Get metadata terms from each sentence
    coded_term_edges = graph_util.record_to_bipartite_edges(
        records=sentences,
        get_neighbor_keys_fn=text_util.get_mesh_keys,
    )
    ckpt("coded_term_edges", ckpt_prefix, textfile=True)

    # Make edges between each adj sentence
    adj_sent_edges = graph_util.record_to_bipartite_edges(
        records=sentences,
        get_neighbor_keys_fn=text_util.get_adjacent_sentences,
        # We can store only one side of the connection because each sentence
        # will get their own neighbors. Additionally, these should all have the
        # same sort of connections.
        bidirectional=False,
    )
    ckpt("adj_sent_edges", ckpt_prefix, textfile=True)

    # Apply lemmatization and entity extraction to sentences
    parsed_sentences = sentences.map_partitions(
        text_util.analyze_sentences,  # --
        text_field="sent_text",
    )
    ckpt("parsed_sentences", ckpt_prefix)

    # Get lemma edges
    lemma_edges = graph_util.record_to_bipartite_edges(
        records=parsed_sentences,
        get_neighbor_keys_fn=text_util.get_interesting_token_keys,
    )
    ckpt("lemma_edges", ckpt_prefix, textfile=True)

    # Get entity edges
    entity_edges = graph_util.record_to_bipartite_edges(
        records=parsed_sentences,
        get_neighbor_keys_fn=text_util.get_entity_keys,
    )
    ckpt("entity_edges", ckpt_prefix, textfile=True)

    # If we're running semrep
    if (config.semrep.HasField("semrep_install_dir")
            and config.semrep.HasField("metamap_install_dir")
            and semrep_work_dir is not None):
        prefixed_semrep_work_dir = semrep_work_dir.joinpath(ckpt_prefix)
        prefixed_semrep_work_dir.mkdir(parents=True, exist_ok=True)
        semrep_sentences = \
            semrep_util.extract_entities_and_predicates_from_sentences(
                sentence_records=sentences,
                unicode_to_ascii_jar_path=config.semrep.unicode_to_ascii_jar_path,
                semrep_install_dir=config.semrep.semrep_install_dir,
                work_dir=prefixed_semrep_work_dir,
                lexicon_year=config.semrep.lexicon_year,
                mm_data_year=config.semrep.mm_data_year,
                mm_data_version=config.semrep.mm_data_version,
            )
        ckpt("semrep_sentences", ckpt_prefix)

    # Embed each sentence
    embedded_sentences = sentences.map_partitions(
        embedding_util.embed_records,  # --
        batch_size=config.sys.batch_size,
        text_field="sent_text",
        max_sequence_length=config.parser.max_sequence_length,
    )
    ckpt("embedded_sentences", ckpt_prefix)

    # hash each sentence id
    hashed_embeddings = embedded_sentences.map(
        lambda x: {
            "id": misc_util.hash_str_to_int(x["id"]),
            "embedding": x["embedding"],
        }
    )
    ckpt("hashed_embeddings", ckpt_prefix)

    hashed_names = sentences.map(
        lambda rec: {
            "name": rec["id"],
            "hash": misc_util.hash_str_to_int(rec["id"]),
        }
    )
    ckpt("hashed_names", ckpt_prefix)
def record_to_bipartite_edges(
    records: dbag.Bag,
    get_neighbor_keys_fn: Callable[[Record], List[str]],
    weight_by_tf_idf: bool = True,
    minimum_document_frequency: int = 2,
    bidirectional: bool = True,
    default_weight_multiplier: float = 1.0,
    get_source_key_fn: Callable[[Record], str] = lambda x: x["id"],
) -> dbag.Bag:
    """
    This function is responsible for extracting edges from records. For
    example, if you had a bag of records, each containing a set of terms, you
    might want to get the set of edges between records and terms.

    @param records: The collection of records we wish to extract edges from.
    @param get_neighbor_keys_fn: Given a record, return a list of graph keys
        that are adjacent to the given record
    @param weight_by_tf_idf: If true, perform tf-idf weighting on edges. In
        this case, if t is a term, d is a document and C is a corpus, then we
        calculate 1 / ((times t occurs in d / log(len(d) + 1))
        * log(size of C / number of d containing t))
    @param minimum_document_frequency: only used if weight_by_tf_idf is true.
        Removes nodes among neighbors that don't occur frequently enough.
    @param bidirectional: If true, we write record->neighbor and
        neighbor->record. If false, we only write record->neighbor.
    @param default_weight_multiplier: All weights are multiplied by this. If we
        aren't calculating tf-idf, this is the value of every weight.
    @param get_source_key_fn: Given a record, return a graph key that uniquely
        identifies the root. By default we get the "id" field
    @return A collection of networkx subgraphs
    """

    def to_id_term_freq_len(records):
        res = []
        for record in records:
            id_ = get_source_key_fn(record)
            tfs = {}
            neighs = get_neighbor_keys_fn(record)
            for n in neighs:
                if n in tfs:
                    tfs[n] += 1
                else:
                    tfs[n] = 1
            res += [
                (id_, term, freq, len(neighs)) for term, freq in tfs.items()
            ]
        # columns=id, term, freq, doc_len
        return res

    def to_partial_doc_freqs(records):
        t2df = {}
        for record in records:
            for t in set(get_neighbor_keys_fn(record)):
                if t in t2df:
                    t2df[t] += 1
                else:
                    t2df[t] = 1
        # columns=term, doc_freq
        return list(t2df.items())

    def calculate_tf_idf_part(part, corpus_size):
        res = []
        for row in part.itertuples():
            tfidf = 1.0 / (
                (row.freq / log(row.doc_len + 1))
                * (log(float(corpus_size) / row.doc_freq))
            )
            res.append([row.id, row.term, tfidf])
        return pd.DataFrame(res, columns=["id", "term", "freq"])

    def part_to_graph(id_term_freqs):
        graph = nx.Graph()
        for row in id_term_freqs:
            i, t, f = row[:3]
            f *= default_weight_multiplier
            graph.add_edge(i, t, weight=f)
            if bidirectional:
                graph.add_edge(t, i, weight=f)
        return [graph]

    # a list of (id, term, freq)
    term_df = (
        records
        .map_partitions(to_id_term_freq_len)
        .to_dataframe(
            meta={
                "id": str,
                "term": str,
                "freq": float,
                "doc_len": int,
            }
        )
    )

    if weight_by_tf_idf:
        # A list of (term, doc_freq)
        document_frequencies = (
            records
            .map_partitions(to_partial_doc_freqs)
            .to_dataframe(
                meta={
                    "term": str,
                    "doc_freq": int,
                }
            )
            .groupby("term")
            .sum()
        )
        # filter
        document_frequencies = document_frequencies[
            document_frequencies["doc_freq"] >= minimum_document_frequency
        ]
        corpus_size = records.count()
        term_df = (
            term_df
            .join(document_frequencies, how="inner", on="term")
            .map_partitions(
                calculate_tf_idf_part,
                corpus_size=corpus_size,
                meta={
                    "id": str,
                    "term": str,
                    "freq": float,
                },
            )
        )

    return term_df.to_bag().map_partitions(part_to_graph)
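# Worked example of the per-edge weight computed in calculate_tf_idf_part
# above, with made-up counts: a term occurring 3 times in a 40-token document
# that appears in 5 of 1000 documents.
from math import log

freq, doc_len, doc_freq, corpus_size = 3, 40, 5, 1000
weight = 1.0 / ((freq / log(doc_len + 1)) * log(corpus_size / doc_freq))
print(round(weight, 4))  # ~0.2336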