def run_oie(lines, batch_size=1, debug=False):
    """
    Run the OIE model over ``lines`` and return the formatted extractions.

    Args:
        lines: The input sentences. They are batched with ``chunks`` and each
            sentence is handed to ``create_instances`` to build model inputs.
        batch_size: Size argument passed to ``chunks`` when batching ``lines``.
        debug: If true, logging is configured at DEBUG level, otherwise INFO.

    Returns:
        A list with one entry per extraction: the value produced by
        ``format_extractions`` with a tab and the confidence computed by
        ``get_confidence`` appended.
    """
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    # Init OIE
    #model = open_information_extraction_stanovsky_2018()
    model = PretrainedModel('model_final.tar.gz', 'open-information-extraction')
    model = model.predictor() # type: ignore

    # process sentences
    logging.info("Processing sentences")
    oie_lines = []
    for chunk in tqdm(chunks(lines, batch_size)):
        oie_inputs = []
        for sent in chunk:
            oie_inputs.extend(create_instances(model, sent))
        if not oie_inputs:
            # No predicates in this batch of sentences
            continue

        # Run oie on sents
        sent_preds = model.predict_batch_json(oie_inputs)

        # Group the predictions by sentence. The key is the token tuple
        # itself rather than a space-joined string: joining and later
        # splitting on " " does not round-trip when a token contains a space
        # (or when the token list is empty), which would misalign the tokens
        # with their per-token tags.
        predictions_by_sent = defaultdict(list)
        for outputs in sent_preds:
            words = outputs["words"]
            tags = outputs["tags"]
            assert len(words) == len(tags)
            predictions_by_sent[tuple(words)].append(
                (tags, outputs["class_probabilities"])
            )

        # Create extractions by sentence
        for sent_tokens, predictions_for_sent in predictions_by_sent.items():
            raw_tags = [tags for tags, _ in predictions_for_sent]
            class_probs = [probs for _, probs in predictions_for_sent]

            # Compute confidence per extraction
            confs = [
                get_confidence(model, tag_per_token, class_prob)
                for tag_per_token, class_prob in zip(raw_tags, class_probs)
            ]

            # The second return value of format_extractions is not used here.
            extractions, _ = format_extractions(
                [Mock_token(tok) for tok in sent_tokens], raw_tags
            )

            oie_lines.extend(
                extraction + f"\t{conf}"
                for extraction, conf in zip(extractions, confs)
            )

    logging.info("DONE")
    return oie_lines
sent_str = " ".join(sent_tokens) assert (len(sent_tokens) == len(tags)) predictions_by_sent[sent_str].append( (outputs["tags"], outputs["class_probabilities"])) # Create extractions by sentence for sent_tokens, predictions_for_sent in predictions_by_sent.items(): raw_tags = list(map(itemgetter(0), predictions_for_sent)) class_probs = list(map(itemgetter(1), predictions_for_sent)) # Compute confidence per extraction confs = [ get_confidence(model, tag_per_token, class_prob) for tag_per_token, class_prob in zip(raw_tags, class_probs) ] extractions, tags = format_extractions( [Mock_token(tok) for tok in sent_tokens.split(" ")], raw_tags) oie_lines.extend([ extraction + f"\t{conf}" for extraction, conf in zip(extractions, confs) ]) t2 = time.perf_counter() print("E2E time: ", t2 - t1) # Write to file logging.info(f"Writing output to {out_fn}") with open(out_fn, "w", encoding="utf8") as fout: fout.write("\n".join(oie_lines)) logging.info("DONE")