def put_bag(
    bag: dbag.Bag,
    collection: str,
    indexed_field_name: Optional[str] = None,
    index_type: MONGO_INDEX = pymongo.HASHED,
) -> dbag.Bag:
  """
  Writes all records to the given collection and sets an index if specified.
  Returns a bag containing one `True` per written partition, intended for use
  in the checkpointing system.
  """
  def put_part_wrapper(*args, **kwargs):
    put(*args, **kwargs)
    return [True]
  if indexed_field_name is not None:
    print(
        f"\t- Setting index: {collection}.{indexed_field_name}:{index_type}"
    )
    set_index(
        collection=collection,
        field_name=indexed_field_name,
        index_type=index_type,
    )
  return dbag.from_delayed([
    dask.delayed(put_part_wrapper)(
      records=part,
      collection=collection,
    )
    for part in bag.to_delayed()
  ])
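# Illustrative usage sketch (not part of the original module): writes a small
# bag of records to a "sentences" collection with a hashed index on "id".
# Assumes the MongoDB connection used by `put` / `set_index` is already
# configured and a dask scheduler is available.
#
#   import dask.bag as dbag
#   records = dbag.from_sequence(
#       [{"id": f"s:{i}", "text": f"sentence {i}"} for i in range(100)],
#       npartitions=4,
#   )
#   status = put_bag(records, collection="sentences", indexed_field_name="id")
#   status.compute()  # triggers the writes; yields one True per partition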
def save(
    bag: dbag.Bag,
    path: Path,
    keep_partial_result: bool = False,
) -> dask.delayed:
  path.mkdir(parents=True, exist_ok=True)
  save_tasks = []
  for part_idx, part in enumerate(bag.to_delayed()):
    part_path = path.joinpath(f"part-{part_idx}{EXT}")
    # Write the partition if it is not already present, or if we are not
    # keeping partial results.
    if not part_path.is_file() or not keep_partial_result:
      save_tasks.append(dask.delayed(save_part)(part, part_path))
    else:
      # Introduces a no-op that keeps the __done__ file correct.
      save_tasks.append(dask.delayed(part_path))
  return dask.delayed(write_done_file)(save_tasks, path)
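# Illustrative usage sketch (assumes the module-level `EXT`, `save_part`, and
# `write_done_file` helpers referenced above): saves a bag under ./output,
# skipping partitions already on disk when keep_partial_result=True.
#
#   import dask.bag as dbag
#   from pathlib import Path
#   bag = dbag.from_sequence(range(1000), npartitions=8)
#   done_task = save(bag, Path("./output"), keep_partial_result=True)
#   done_task.compute()  # writes the part files, then the __done__ marker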
def extract_entities_and_predicates_from_sentences(
    sentence_records: dbag.Bag,
    semrep_install_dir: Path,
    unicode_to_ascii_jar_path: Path,
    work_dir: Path,
    lexicon_year: int,
    mm_data_year: str,
    mm_data_version: str,
) -> dbag.Bag:
  """Runs each sentence through SemRep. Identifies Predicates and Entities.

  Requires get_metamap_server_initializer added to dask_process_global.

  Args:
    sentence_records: Each record needs `id` and `sent_text`.
    work_dir: A directory visible to all workers where SemRep intermediate
      files will be stored.
    semrep_install_dir: The path where SemRep was installed.

  Returns:
    One record per input sentence, where the `id` of the new record matches
    the input. However, returned records will only have `entities` and
    `predicates`.

  """
  work_dir = Path(work_dir)
  assert work_dir.is_dir(), f"Failed to find shared work_dir: {work_dir}"
  semrep_input_dir = work_dir.joinpath("input_files")
  semrep_output_dir = work_dir.joinpath("output_files")
  semrep_input_dir.mkdir(exist_ok=True, parents=True)
  semrep_output_dir.mkdir(exist_ok=True, parents=True)
  semrep_tasks = []
  for part_idx, partition in enumerate(sentence_records.to_delayed()):
    semrep_input_path = semrep_input_dir.joinpath(f"input_{part_idx}.txt")
    # semrep_output_path = semrep_output_dir.joinpath(f"output_{part_idx}.xml")
    semrep_tasks.append(dask.delayed(_sentence_partition_to_records)(
      records=partition,
      unicode_to_ascii_jar_path=unicode_to_ascii_jar_path,
      input_path=semrep_input_path,
      semrep_install_dir=semrep_install_dir,
      lexicon_year=lexicon_year,
      mm_data_year=mm_data_year,
      mm_data_version=mm_data_version,
    ))
  return dbag.from_delayed(semrep_tasks)
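# Illustrative usage sketch. Every path and configuration value below is a
# placeholder: SemRep and the unicode-to-ascii jar must actually exist at the
# given locations, the MetaMap data year/version strings must match your
# installation, and the MetaMap server initializer must be registered in
# dask_process_global before the result is computed.
#
#   sentences = dbag.from_sequence([
#       {"id": "s:1234:1:1", "sent_text": "Tobacco smoke causes lung cancer."},
#   ])
#   annotated = extract_entities_and_predicates_from_sentences(
#       sentence_records=sentences,
#       semrep_install_dir=Path("/opt/semrep"),
#       unicode_to_ascii_jar_path=Path("/opt/semrep/lib/unicode_to_ascii.jar"),
#       work_dir=Path("/shared/semrep_work"),
#       lexicon_year=2020,
#       mm_data_year="2020AA",
#       mm_data_version="USAbase",
#   )
#   annotated.compute()  # records now carry `entities` and `predicates`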
def train_distributed_knn(
    hash_and_embedding: dbag.Bag,
    batch_size: int,
    num_centroids: int,
    num_probes: int,
    num_quantizers: int,
    bits_per_quantizer: int,
    training_sample_prob: float,
    shared_scratch_dir: Path,
    final_index_path: Path,
    id_field: str = "id",
    embedding_field: str = "embedding",
) -> Path:
  """
  Computing all of the embeddings and then performing a KNN is a problem for
  memory. So, what we need to do instead is compute batches of embeddings,
  and use them in Faiss to reduce their dimensionality and process them
  appropriately.

  I'm so sorry this one function has to do so much...

  @param hash_and_embedding: bag of hash value and embedding values
  @param id_field: output id field we use to store number hashes
  @param embedding_field: field of each record holding its embedding vector
  @param batch_size: number of sentences per batch
  @param num_centroids: number of voronoi cells in approx nn
  @param num_probes: number of cells to consider when querying
  @param num_quantizers: number of sub-vectors to discretize
  @param bits_per_quantizer: bits per sub-vector
  @param shared_scratch_dir: location to store intermediate results.
  @param training_sample_prob: chance a point is trained on
  @return The path you can load the resulting FAISS index
  """
  init_index_path = shared_scratch_dir.joinpath("init.index")
  if not init_index_path.is_file():
    print("\t- Constructing initial index:", init_index_path)
    # First off, we need to get a representative sample for faiss training
    training_data = hash_and_embedding.random_sample(
        prob=training_sample_prob
    ).pluck(embedding_field)
    # Train the initial index; the result is written to init_index_path.
    dask.compute(
        dask.delayed(train_initial_index)(
          training_data=training_data,
          num_centroids=num_centroids,
          num_probes=num_probes,
          num_quantizers=num_quantizers,
          bits_per_quantizer=bits_per_quantizer,
          output_path=init_index_path,
        )
    )
  else:
    print("\t- Using initial index:", init_index_path)

  # For each partition, load embeddings into the index.
  partial_idx_paths = []
  for part_idx, part in enumerate(hash_and_embedding.to_delayed()):
    part_path = shared_scratch_dir.joinpath(f"part-{part_idx}.index")
    if part_path.is_file():  # rudimentary checkpoint
      partial_idx_paths.append(dask.delayed(part_path))
    else:
      partial_idx_paths.append(
          dask.delayed(add_points_to_index)(
            records=part,
            init_index_path=init_index_path,
            output_path=part_path,
            batch_size=batch_size,
          )
      )
  return dask.delayed(merge_index)(
      init_index_path=init_index_path,
      partial_idx_paths=partial_idx_paths,
      final_index_path=final_index_path,
  )
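# Illustrative usage sketch (assumptions: each record in `embeddings_bag`
# carries "id" and "embedding" fields, shared_scratch_dir is on a filesystem
# visible to every dask worker, and `merge_index` returns the final index
# path). The call returns a delayed task; computing it trains, populates, and
# merges the index, which can then be loaded with faiss.read_index.
#
#   final_path = train_distributed_knn(
#       hash_and_embedding=embeddings_bag,
#       batch_size=2048,
#       num_centroids=4096,
#       num_probes=16,
#       num_quantizers=96,
#       bits_per_quantizer=8,
#       training_sample_prob=0.01,
#       shared_scratch_dir=Path("/shared/scratch"),
#       final_index_path=Path("/shared/final.index"),
#   ).compute()
#   index = faiss.read_index(str(final_path))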