Example #1
0
def main(span_model_path: str, span_to_question_model_path: str,
         cuda_device: int, input_file: str, output_file: str,
         span_min_prob: float, question_min_prob: float,
         question_beam_size: int) -> None:
    """Run the answer-first sequential QA-SRL pipeline over a JSON-lines file.

    Loads the span model and the span-to-question model from their archive
    folders, builds an ``AFirstPipelineSequential``, and writes one predicted
    JSON object per input line to stdout, a gzip file, or a plain text file
    depending on ``output_file``.

    Args:
        span_model_path: Folder containing the span model archive.
        span_to_question_model_path: Folder containing the span-to-question
            model archive.
        cuda_device: Device id for inference (-1 for CPU).
        input_file: JSON-lines input (one sentence object per line).
        output_file: Destination path; ``None`` writes to stdout, a ``.gz``
            suffix selects gzip output.
        span_min_prob: Minimum span probability threshold for the pipeline.
        question_min_prob: Minimum question probability threshold.
        question_beam_size: Beam size for question generation.
    """
    check_for_gpu(cuda_device)

    # Override the span detection threshold to be low enough that we can
    # reasonably approximate bad spans as having probability 0.
    span_model_archive = load_archive_from_folder(
        span_model_path,
        cuda_device=cuda_device,
        overrides=
        '{ "model": { "span_selector": {"span_decoding_threshold": 0.00} } }',
        weights_file=os.path.join(span_model_path, "best.th"))

    span_to_question_model_archive = load_archive_from_folder(
        span_to_question_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_to_question_model_path, "best.th"))

    # Allow all instances through the dataset readers' QA-SRL filter so the
    # pipeline sees every sentence, not just the training subset.
    span_model_dataset_reader_params = span_model_archive.config[
        "dataset_reader"].duplicate()
    span_model_dataset_reader_params["qasrl_filter"]["allow_all"] = True

    span_to_question_model_dataset_reader_params = span_to_question_model_archive.config[
        "dataset_reader"].duplicate()
    span_to_question_model_dataset_reader_params["qasrl_filter"][
        "allow_all"] = True

    pipeline = AFirstPipelineSequential(
        span_model=span_model_archive.model,
        span_model_dataset_reader=DatasetReader.from_params(
            span_model_dataset_reader_params),
        span_to_question_model=span_to_question_model_archive.model,
        span_to_question_model_dataset_reader=DatasetReader.from_params(
            span_to_question_model_dataset_reader_params),
        span_minimum_threshold=span_min_prob,
        question_minimum_threshold=question_min_prob,
        question_beam_size=question_beam_size)

    def run(write_line) -> None:
        # Single pass over the input: one JSON object per line in,
        # one serialized prediction per line out.
        for line in tqdm(read_lines(cached_path(input_file))):
            input_json = json.loads(line)
            output_json = pipeline.predict(input_json)
            write_line(json.dumps(output_json))

    if output_file is None:
        run(print)
    elif output_file.endswith('.gz'):
        with gzip.open(output_file, 'wt') as f:
            run(lambda s: f.write(s + '\n'))
    else:
        with open(output_file, 'w', encoding='utf8') as out:
            run(lambda s: print(s, file=out))
Example #2
0
 def get_qasrl_sentences(self, file_path: str):
     """Yield ``(sentence_tokens, metadata)`` pairs from a QA-SRL JSON-lines file.

     Each input line is a sentence object; the metadata dict carries the
     sentence id and the integer verb indices found in ``verbEntries``.

     Args:
         file_path: Path (or URL cacheable via ``cached_path``) to the file.
     """
     for line in read_lines(cached_path(file_path)):
         sentence_json = json.loads(line)
         # Iterate the dict's keys directly; the values were unused
         # (the original iterated .items() and discarded the value).
         verb_indices = [int(k) for k in sentence_json["verbEntries"]]
         yield (sentence_json["sentenceTokens"], {
             "sentence_id": sentence_json["sentenceId"],
             "verb_indices": verb_indices
     })
Example #3
0
def main(question_model_path: str,
         question_to_span_model_path: str,
         tan_model_path: str,
         span_to_tan_model_path: str,
         animacy_model_path: str,
         cuda_device: int,
         input_file: str,
         output_file: str,
         span_min_prob: float,
         question_min_prob: float,
         tan_min_prob: float,
         question_beam_size: int,
         clause_mode: bool) -> None:
    """Run the question-first QA-SRL pipeline over a JSON-lines input file.

    Loads up to five model archives (the TAN, span-to-TAN, and animacy models
    are optional), wires them into a ``QFirstPipeline``, and writes one
    predicted JSON object per input line to stdout or to ``output_file``.
    """
    # NOTE(review): the caller-supplied clause_mode is unconditionally
    # overridden to True here, making the parameter dead — confirm intentional.
    clause_mode = True
    print("Checking device...", flush = True)
    check_for_gpu(cuda_device)
    print("Loading models...", flush = True)

    def load(archive_folder):
        # All archives share the same loading recipe: same device, weights
        # taken from best.th inside the archive folder.
        return load_archive_from_folder(
            archive_folder,
            cuda_device = cuda_device,
            weights_file = os.path.join(archive_folder, "best.th"))

    pipeline = QFirstPipeline(
        question_model_archive = load(question_model_path),
        question_to_span_model_archive = load(question_to_span_model_path),
        tan_model_archive = load(tan_model_path) if tan_model_path is not None else None,
        span_to_tan_model_archive = load(span_to_tan_model_path) if span_to_tan_model_path is not None else None,
        animacy_model_archive = load(animacy_model_path) if animacy_model_path is not None else None,
        question_minimum_threshold = question_min_prob,
        span_minimum_threshold = span_min_prob,
        tan_minimum_threshold = tan_min_prob,
        question_beam_size = question_beam_size,
        clause_mode = clause_mode)
    print("Models loaded. Running...", flush = True)

    if output_file is None:
        # No destination given: stream predictions to stdout.
        for line in read_lines(cached_path(input_file)):
            print(json.dumps(pipeline.predict(json.loads(line))))
    else:
        with open(output_file, 'w', encoding = 'utf8') as out:
            for line in read_lines(cached_path(input_file)):
                prediction = pipeline.predict(json.loads(line))
                # Dot-per-line progress indicator on stdout.
                print(".", end = "", flush = True)
                print(json.dumps(prediction), file = out)
Example #4
0
def main(model_path: str, cuda_device: int, input_file: str, output_file: str,
         span_min_prob: float) -> None:
    """Run the old answer-first pipeline over a JSON-lines file.

    Loads a single model archive, puts the model in eval mode, and writes one
    predicted JSON object per input line to stdout (when ``output_file`` is
    ``None``) or to ``output_file``.
    """
    check_for_gpu(cuda_device)
    archive = load_archive(model_path, cuda_device=cuda_device)
    # Inference only: disable dropout / batch-norm updates.
    archive.model.eval()
    reader_params = archive.config["dataset_reader"].duplicate()
    pipeline = AFirstPipelineOld(
        model=archive.model,
        dataset_reader=DatasetReader.from_params(reader_params),
        span_minimum_threshold=span_min_prob)
    if output_file is None:
        for line in read_lines(cached_path(input_file)):
            print(json.dumps(pipeline.predict(json.loads(line))))
    else:
        with open(output_file, 'w', encoding='utf8') as out:
            for line in read_lines(cached_path(input_file)):
                print(json.dumps(pipeline.predict(json.loads(line))), file=out)
Example #5
0
 def _read(self, file_list: str):
     """Yield QA-SRL instances from a comma-separated list of file paths.

     Empty entries in ``file_list`` are skipped. Resets the instance/verb
     counters before reading and logs a summary afterwards.

     Args:
         file_list: Comma-separated dataset file paths (each cacheable via
             ``cached_path``).
     """
     self._num_verbs = 0
     self._num_instances = 0
     for file_path in file_list.split(","):
         if file_path.strip() == "":
             continue
         logger.info("Reading QASRL instances from dataset file at: %s",
                     file_path)
         for line in read_lines(cached_path(file_path)):
             # Presumably sentence_json_to_instances updates the counters as
             # a side effect — confirm in the class body.
             yield from self.sentence_json_to_instances(json.loads(line))
     # Pass lazy %-args instead of pre-formatting with the % operator, so
     # formatting only happens if the record is actually emitted.
     logger.info("Produced %d instances for %d verbs.",
                 self._num_instances, self._num_verbs)
Example #6
0
def main(span_model_path: str, span_to_question_model_path: str,
         cuda_device: int, input_file: str, output_file: str,
         span_min_prob: float, question_min_prob: float,
         question_beam_size: int) -> None:
    """Run the answer-first QA-SRL pipeline over a JSON-lines input file.

    Loads the span and span-to-question model archives, assembles an
    ``AFirstPipeline``, and writes one predicted JSON object per input line
    to stdout (when ``output_file`` is ``None``) or to ``output_file``.
    """
    check_for_gpu(cuda_device)

    span_archive = load_archive_from_folder(
        span_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_model_path, "best.th"))
    question_archive = load_archive_from_folder(
        span_to_question_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_to_question_model_path, "best.th"))

    # Each model is paired with a reader built from its own archived config.
    span_reader = DatasetReader.from_params(
        span_archive.config["dataset_reader"].duplicate())
    question_reader = DatasetReader.from_params(
        question_archive.config["dataset_reader"].duplicate())

    pipeline = AFirstPipeline(
        span_model=span_archive.model,
        span_model_dataset_reader=span_reader,
        span_to_question_model=question_archive.model,
        span_to_question_model_dataset_reader=question_reader,
        span_minimum_threshold=span_min_prob,
        question_minimum_threshold=question_min_prob,
        question_beam_size=question_beam_size)

    def process(sink) -> None:
        # print(..., file=None) writes to stdout, matching the no-file case.
        for line in read_lines(cached_path(input_file)):
            print(json.dumps(pipeline.predict(json.loads(line))), file=sink)

    if output_file is None:
        process(None)
    else:
        with open(output_file, 'w', encoding='utf8') as out:
            process(out)