Example #1
def put_bag(
    bag: dbag.Bag,
    collection: str,
    indexed_field_name: Optional[str] = None,
    index_type: MONGO_INDEX = pymongo.HASHED,
) -> dbag.Bag:
    """
  Writes all the records to collection. Sets index if specified. Returns a bag
  simply containing the number of written records, indented for use in the
  checkpointing system.
  """
    def put_part_wrapper(*args, **kwargs):
        put(*args, **kwargs)
        return [True]

    if indexed_field_name is not None:
        print(
            f"\t- Setting index: {collection}.{indexed_field_name}:{index_type}"
        )
        set_index(collection=collection,
                  field_name=indexed_field_name,
                  index_type=index_type)
    return dbag.from_delayed([
        dask.delayed(put_part_wrapper)(
            records=part,
            collection=collection,
        ) for part in bag.to_delayed()
    ])
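A minimal usage sketch for `put_bag`, assuming the surrounding module provides the `put` and `set_index` helpers and a configured MongoDB connection; the record shape and collection name are illustrative only:

import dask.bag as dbag

# Hypothetical records: put_bag expects a bag of dict-like records.
records = dbag.from_sequence(
    [{"id": i, "text": f"document {i}"} for i in range(100)],
    npartitions=4,
)
# Sets a hashed index on `id` up front, then writes each partition via `put`.
result = put_bag(records, collection="documents", indexed_field_name="id")
result.compute()  # triggers the per-partition writes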
Example #2
def save(bag: dbag.Bag, path: Path, keep_partial_result: bool = False) -> dask.delayed:
  path.mkdir(parents=True, exist_ok=True)
  save_tasks = []
  for part_idx, part in enumerate(bag.to_delayed()):
    part_path = path.joinpath(f"part-{part_idx}{EXT}")
    # if the partial result is not present, or we're not keeping partials
    if not part_path.is_file() or not keep_partial_result:
      save_tasks.append(dask.delayed(save_part)(part, part_path))
    else:
      # introduces a no-op that keeps __done__ file correct
      save_tasks.append(dask.delayed(part_path))
  return dask.delayed(write_done_file)(save_tasks, path)
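A usage sketch for `save`, assuming `EXT`, `save_part`, and `write_done_file` are defined elsewhere in the module; the output directory is a placeholder:

import dask.bag as dbag
from pathlib import Path

records = dbag.from_sequence(range(1_000), npartitions=8)
done_task = save(records, Path("checkpoints/records"), keep_partial_result=True)
done_task.compute()  # writes one part file per partition, then the done marker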
Example #3
def extract_entities_and_predicates_from_sentences(
    sentence_records: dbag.Bag,
    semrep_install_dir: Path,
    unicode_to_ascii_jar_path: Path,
    work_dir: Path,
    lexicon_year: int,
    mm_data_year: str,
    mm_data_version: str,
) -> dbag.Bag:
  """Runs each sentence through SemRep. Identifies Predicates and Entities

  Requires get_metamap_server_initializer added to dask_process_global.

  Args:
    sentence_records: Each record needs `id` and `sent_text`.
    work_dir: A directory visible to all workers where SemRep intermediate files
      will be stored.
    semrep_install_dir: The path where semrep was installed.

  Returns:
    One record per input sentence, where the `id` of the new record matches the
    input. However, returned records will only have `entities` and `predicates`.

  """

  work_dir = Path(work_dir)
  assert work_dir.is_dir(), f"Failed to find shared work_dir: {work_dir}"
  semrep_input_dir = work_dir.joinpath("input_files")
  semrep_output_dir = work_dir.joinpath("output_files")
  semrep_input_dir.mkdir(exist_ok=True, parents=True)
  semrep_output_dir.mkdir(exist_ok=True, parents=True)

  semrep_tasks = []
  for part_idx, partition in enumerate(sentence_records.to_delayed()):
    semrep_input_path = semrep_input_dir.joinpath(f"input_{part_idx}.txt")
    # semrep_output_path = semrep_output_dir.joinpath(f"output_{part_idx}.xml")
    semrep_tasks.append(dask.delayed(_sentence_partition_to_records)(
        records=partition,
        unicode_to_ascii_jar_path=unicode_to_ascii_jar_path,
        input_path=semrep_input_path,
        semrep_install_dir=semrep_install_dir,
        lexicon_year=lexicon_year,
        mm_data_year=mm_data_year,
        mm_data_version=mm_data_version,
    ))
  return dbag.from_delayed(semrep_tasks)
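A usage sketch; the install paths, MetaMap data year/version, and the sample sentence are placeholders, and `get_metamap_server_initializer` must already be registered with `dask_process_global` as the docstring notes:

import dask.bag as dbag
from pathlib import Path

sentences = dbag.from_sequence([
    {"id": "s:12345678:1:2", "sent_text": "Aspirin inhibits platelet aggregation."},
], npartitions=1)
annotated = extract_entities_and_predicates_from_sentences(
    sentence_records=sentences,
    semrep_install_dir=Path("/opt/semrep"),                    # placeholder path
    unicode_to_ascii_jar_path=Path("/opt/utf8_to_ascii.jar"),  # placeholder path
    work_dir=Path("/shared/semrep_work"),  # must be visible to every worker
    lexicon_year=2020,
    mm_data_year="2020AA",
    mm_data_version="USAbase",
)
annotated.compute()  # each record keeps `id` plus `entities` and `predicates`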
Example #4
def train_distributed_knn(
    hash_and_embedding: dbag.Bag,
    batch_size: int,
    num_centroids: int,
    num_probes: int,
    num_quantizers: int,
    bits_per_quantizer: int,
    training_sample_prob: float,
    shared_scratch_dir: Path,
    final_index_path: Path,
    id_field: str = "id",
    embedding_field: str = "embedding",
) -> Path:
    """
  Computing all of the embeddings and then performing a KNN is a problem for memory.
  So, what we need to do instead is compute batches of embeddings, and use them in Faiss
  to reduce their dimensionality and process the appropriatly.

  I'm so sorry this one function has to do so much...

  @param hash_and_embedding: bag of hash value and embedding values
  @param text_field: input text field that we embed.
  @param id_field: output id field we use to store number hashes
  @param batch_size: number of sentences per batch
  @param num_centroids: number of voronoi cells in approx nn
  @param num_probes: number of cells to consider when querying
  @param num_quantizers: number of sub-vectors to discritize
  @param bits_per_quantizer: bits per sub-vector
  @param shared_scratch_dir: location to store intermediate results.
  @param training_sample_prob: chance a point is trained on
  @return The path you can load the resulting FAISS index
  """
    init_index_path = shared_scratch_dir.joinpath("init.index")

    if not init_index_path.is_file():
        print("\t- Constructing initial index:", init_index_path)
        # First off, we need to get a representative sample for faiss training
        training_data = hash_and_embedding.random_sample(
            prob=training_sample_prob).pluck(embedding_field)

        # Train initial index, store result in init_index_path
        # dask.compute returns a tuple of results; unpack the single path
        init_index_path = dask.compute(
            dask.delayed(train_initial_index)(
                training_data=training_data,
                num_centroids=num_centroids,
                num_probes=num_probes,
                num_quantizers=num_quantizers,
                bits_per_quantizer=bits_per_quantizer,
                output_path=init_index_path,
            ))[0]
    else:
        print("\t- Using initial index:", init_index_path)

    # For each partition, load embeddings to idx
    partial_idx_paths = []
    for part_idx, part in enumerate(hash_and_embedding.to_delayed()):
        part_path = shared_scratch_dir.joinpath(f"part-{part_idx}.index")
        if part_path.is_file():  # rudimentary ckpt
            partial_idx_paths.append(dask.delayed(part_path))
        else:
            partial_idx_paths.append(
                dask.delayed(add_points_to_index)(
                    records=part,
                    init_index_path=init_index_path,
                    output_path=part_path,
                    batch_size=batch_size,
                ))

    return dask.delayed(merge_index)(
        init_index_path=init_index_path,
        partial_idx_paths=partial_idx_paths,
        final_index_path=final_index_path,
    )
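A usage sketch for `train_distributed_knn`; the bag contents, index hyperparameters, and paths are illustrative, and `train_initial_index`, `add_points_to_index`, and `merge_index` come from the same module. The function returns a delayed task, so it must be computed to build the index:

import dask.bag as dbag
from pathlib import Path

embeddings = dbag.from_sequence(
    [{"id": i, "embedding": [0.0] * 256} for i in range(10_000)],
    npartitions=16,
)
final_path = train_distributed_knn(
    hash_and_embedding=embeddings,
    batch_size=1024,
    num_centroids=256,
    num_probes=16,
    num_quantizers=32,
    bits_per_quantizer=8,
    training_sample_prob=0.1,
    shared_scratch_dir=Path("/shared/scratch"),    # placeholder shared directory
    final_index_path=Path("/shared/final.index"),  # placeholder output path
).compute()  # resolves to final_index_path once the merged index is written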