def main(span_model_path: str, span_to_question_model_path: str,
         cuda_device: int, input_file: str, output_file: str,
         span_min_prob: float, question_min_prob: float,
         question_beam_size: int) -> None:
    """Run the answer-first (span -> question) QA-SRL pipeline over a JSON-lines file.

    Loads a span model and a span-to-question model from folder archives,
    wires them into an ``AFirstPipelineSequential``, then predicts one JSON
    object per input line. Output goes to stdout when ``output_file`` is
    None, to a gzip stream when it ends in ``.gz``, and to a plain UTF-8
    file otherwise.

    Args:
        span_model_path: folder archive of the span detection model.
        span_to_question_model_path: folder archive of the question model.
        cuda_device: device id checked via ``check_for_gpu``.
        input_file: JSON-lines input (may be a remote path; ``cached_path``).
        output_file: destination path, or None for stdout.
        span_min_prob: minimum span probability threshold for the pipeline.
        question_min_prob: minimum question probability threshold.
        question_beam_size: beam size for question generation.
    """
    check_for_gpu(cuda_device)
    # Override the span detection threshold to be low enough that we can
    # reasonably approximate bad spans as having probability 0.
    span_model_archive = load_archive_from_folder(
        span_model_path,
        cuda_device=cuda_device,
        overrides='{ "model": { "span_selector": {"span_decoding_threshold": 0.00} } }',
        weights_file=os.path.join(span_model_path, "best.th"))
    span_to_question_model_archive = load_archive_from_folder(
        span_to_question_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_to_question_model_path, "best.th"))
    # Let every instance through the readers' QA-SRL filters at prediction time.
    span_model_dataset_reader_params = span_model_archive.config[
        "dataset_reader"].duplicate()
    span_model_dataset_reader_params["qasrl_filter"]["allow_all"] = True
    span_to_question_model_dataset_reader_params = span_to_question_model_archive.config[
        "dataset_reader"].duplicate()
    span_to_question_model_dataset_reader_params["qasrl_filter"][
        "allow_all"] = True
    pipeline = AFirstPipelineSequential(
        span_model=span_model_archive.model,
        span_model_dataset_reader=DatasetReader.from_params(
            span_model_dataset_reader_params),
        span_to_question_model=span_to_question_model_archive.model,
        span_to_question_model_dataset_reader=DatasetReader.from_params(
            span_to_question_model_dataset_reader_params),
        span_minimum_threshold=span_min_prob,
        question_minimum_threshold=question_min_prob,
        question_beam_size=question_beam_size)

    def _predict_line(line: str) -> str:
        # One JSON object in, one serialized JSON object out.
        return json.dumps(pipeline.predict(json.loads(line)))

    # The three output branches previously duplicated the predict loop;
    # they now share _predict_line and differ only in the sink.
    if output_file is None:
        for line in tqdm(read_lines(cached_path(input_file))):
            print(_predict_line(line))
    elif output_file.endswith('.gz'):
        with gzip.open(output_file, 'wt') as f:
            for line in tqdm(read_lines(cached_path(input_file))):
                f.write(_predict_line(line))
                f.write('\n')
    else:
        with open(output_file, 'w', encoding='utf8') as out:
            for line in tqdm(read_lines(cached_path(input_file))):
                print(_predict_line(line), file=out)
def get_qasrl_sentences(self, file_path: str):
    """Yield ``(tokens, metadata)`` for each sentence in a QA-SRL JSON-lines file.

    ``metadata`` carries the sentence id and the integer token indices of its
    verbs (the keys of ``verbEntries``, serialized as strings in the JSON).

    Args:
        file_path: path (possibly remote; resolved via ``cached_path``) to a
            file with one sentence JSON object per line.
    """
    for line in read_lines(cached_path(file_path)):
        sentence_json = json.loads(line)
        # Iterate the dict's keys directly; the previous .items() loop
        # discarded every value (perf/idiom fix, same result).
        verb_indices = [int(k) for k in sentence_json["verbEntries"]]
        yield (sentence_json["sentenceTokens"], {
            "sentence_id": sentence_json["sentenceId"],
            "verb_indices": verb_indices
        })
def main(question_model_path: str, question_to_span_model_path: str,
         tan_model_path: str, span_to_tan_model_path: str,
         animacy_model_path: str, cuda_device: int, input_file: str,
         output_file: str, span_min_prob: float, question_min_prob: float,
         tan_min_prob: float, question_beam_size: int,
         clause_mode: bool) -> None:
    """Run the question-first QA-SRL pipeline over a JSON-lines input file.

    Loads the (required) question and question-to-span models plus the
    optional TAN / span-to-TAN / animacy models, builds a ``QFirstPipeline``,
    and writes one JSON prediction per input line to ``output_file`` (or
    stdout when ``output_file`` is None).
    """
    # HACK: the clause_mode argument is unconditionally overridden to True,
    # so the caller's value is ignored. Kept to preserve current behavior;
    # TODO remove the override (or the parameter) once this is intentional-or-not
    # is confirmed.
    clause_mode = True
    print("Checking device...", flush=True)
    check_for_gpu(cuda_device)
    print("Loading models...", flush=True)

    def _load(model_path):
        # Optional archives: a None path yields None. The question and
        # question-to-span models are expected to always be provided.
        if model_path is None:
            return None
        return load_archive_from_folder(
            model_path,
            cuda_device=cuda_device,
            weights_file=os.path.join(model_path, "best.th"))

    pipeline = QFirstPipeline(
        question_model_archive=_load(question_model_path),
        question_to_span_model_archive=_load(question_to_span_model_path),
        tan_model_archive=_load(tan_model_path),
        span_to_tan_model_archive=_load(span_to_tan_model_path),
        animacy_model_archive=_load(animacy_model_path),
        question_minimum_threshold=question_min_prob,
        span_minimum_threshold=span_min_prob,
        tan_minimum_threshold=tan_min_prob,
        question_beam_size=question_beam_size,
        clause_mode=clause_mode)
    print("Models loaded. Running...", flush=True)
    if output_file is None:
        for line in read_lines(cached_path(input_file)):
            input_json = json.loads(line)
            output_json = pipeline.predict(input_json)
            print(json.dumps(output_json))
    else:
        with open(output_file, 'w', encoding='utf8') as out:
            for line in read_lines(cached_path(input_file)):
                input_json = json.loads(line)
                output_json = pipeline.predict(input_json)
                # One progress dot per predicted sentence.
                print(".", end="", flush=True)
                print(json.dumps(output_json), file=out)
def main(model_path: str, cuda_device: int, input_file: str,
         output_file: str, span_min_prob: float) -> None:
    """Predict with the legacy answer-first pipeline over a JSON-lines file.

    Loads a single model archive, builds an ``AFirstPipelineOld``, and emits
    one JSON object per input line — to stdout when ``output_file`` is None,
    otherwise to a UTF-8 text file.
    """
    check_for_gpu(cuda_device)
    archive = load_archive(model_path, cuda_device=cuda_device)
    archive.model.eval()
    reader_params = archive.config["dataset_reader"].duplicate()
    pipeline = AFirstPipelineOld(
        model=archive.model,
        dataset_reader=DatasetReader.from_params(reader_params),
        span_minimum_threshold=span_min_prob)
    if output_file is None:
        for line in read_lines(cached_path(input_file)):
            prediction = pipeline.predict(json.loads(line))
            print(json.dumps(prediction))
    else:
        with open(output_file, 'w', encoding='utf8') as sink:
            for line in read_lines(cached_path(input_file)):
                prediction = pipeline.predict(json.loads(line))
                print(json.dumps(prediction), file=sink)
def _read(self, file_list: str):
    """Yield instances from a comma-separated list of QA-SRL dataset files.

    Blank entries in ``file_list`` are skipped. Resets the instance/verb
    counters before reading; presumably ``sentence_json_to_instances``
    increments them — TODO confirm against that method.
    """
    self._num_verbs = 0
    self._num_instances = 0
    for file_path in file_list.split(","):
        if file_path.strip() == "":
            continue
        logger.info("Reading QASRL instances from dataset file at: %s",
                    file_path)
        for line in read_lines(cached_path(file_path)):
            for instance in self.sentence_json_to_instances(
                    json.loads(line)):
                yield instance
    # Fixed: was eager "%"-formatting inside the call; pass lazy args so
    # the string is only built when INFO logging is enabled.
    logger.info("Produced %d instances for %d verbs.",
                self._num_instances, self._num_verbs)
def main(span_model_path: str, span_to_question_model_path: str,
         cuda_device: int, input_file: str, output_file: str,
         span_min_prob: float, question_min_prob: float,
         question_beam_size: int) -> None:
    """Run the answer-first pipeline: detect spans, then generate questions.

    Loads both model archives from folders (weights from each folder's
    ``best.th``), assembles an ``AFirstPipeline``, and writes one JSON
    prediction per input line — to stdout when ``output_file`` is None,
    otherwise to a UTF-8 text file.
    """
    check_for_gpu(cuda_device)
    span_archive = load_archive_from_folder(
        span_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_model_path, "best.th"))
    s2q_archive = load_archive_from_folder(
        span_to_question_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_to_question_model_path, "best.th"))
    span_reader = DatasetReader.from_params(
        span_archive.config["dataset_reader"].duplicate())
    s2q_reader = DatasetReader.from_params(
        s2q_archive.config["dataset_reader"].duplicate())
    pipeline = AFirstPipeline(
        span_model=span_archive.model,
        span_model_dataset_reader=span_reader,
        span_to_question_model=s2q_archive.model,
        span_to_question_model_dataset_reader=s2q_reader,
        span_minimum_threshold=span_min_prob,
        question_minimum_threshold=question_min_prob,
        question_beam_size=question_beam_size)
    if output_file is None:
        for line in read_lines(cached_path(input_file)):
            prediction = pipeline.predict(json.loads(line))
            print(json.dumps(prediction))
    else:
        with open(output_file, 'w', encoding='utf8') as sink:
            for line in read_lines(cached_path(input_file)):
                prediction = pipeline.predict(json.loads(line))
                print(json.dumps(prediction), file=sink)