def get_corrector(approach: str,
                  penalties: Optional[str],
                  insertion_penalty: float = 0,
                  deletion_penalty: float = 0):
    # Look up the forward language model and the optional bidirectional
    # labeling model registered for this approach.
    fwd_model_name, bid_model_name = APPROACHES2MODELS[approach]
    fwd_model = UnidirectionalLMEstimator()
    fwd_model.load(fwd_model_name)
    if bid_model_name is None:
        bid_model = None
    else:
        bid_model = BidirectionalLabelingEstimator()
        bid_model.load(bid_model_name)
    # Resolve the insertion/deletion penalties: fixed for the "ONE" approach,
    # taken from the arguments if no penalty set is named, otherwise looked up.
    if approach == "ONE":
        p_ins, p_del = PENALTIES[approach]
    elif penalties is None:
        p_ins = insertion_penalty
        p_del = deletion_penalty
    else:
        p_ins, p_del = PENALTIES[approach][penalties]
    corrector = BatchedBeamSearchCorrector(
        fwd_model,
        insertion_penalty=-p_ins,
        deletion_penalty=-p_del,
        n_beams=5,
        verbose=benchmark is None,  # `benchmark` is defined at module level in the enclosing script
        labeling_model=bid_model,
        add_epsilon=bid_model is not None)
    return corrector
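# Usage sketch (not part of the source): build a corrector with explicit
# penalties and repair one sequence. "FWD" stands in for whatever approach
# keys APPROACHES2MODELS defines, and the `correct` method on the returned
# BatchedBeamSearchCorrector is assumed here.
#
# corrector = get_corrector("FWD", penalties=None,
#                           insertion_penalty=7.6, deletion_penalty=9.2)
# print(corrector.correct("Tokeniza tionrepair."))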
def __init__(self,
             model_name: str,
             insertion_threshold: float,
             deletion_threshold: float,
             model: Optional[BidirectionalLabelingEstimator] = None):
    # Reuse an already loaded labeling model if one is passed in;
    # otherwise load it by name.
    if model is None:
        model = BidirectionalLabelingEstimator()
        model.load(model_name)
    self.model = model
    self.insertion_threshold = insertion_threshold
    self.deletion_threshold = deletion_threshold
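# Minimal, self-contained sketch (an assumption, not the repository's
# implementation) of the decision rule the two thresholds express: insert a
# space where the labeling model is confident enough, and keep an existing
# space only while its probability stays at or above the deletion threshold.
def apply_thresholds(chars: str, original_spaces: set, space_probs: list,
                     insertion_threshold: float, deletion_threshold: float) -> str:
    # chars: the input with all spaces removed
    # original_spaces: indices i such that the input had a space after chars[i]
    # space_probs: P(space after chars[i]) from the labeling model
    out = ""
    for i, c in enumerate(chars):
        out += c
        if i in original_spaces:
            if space_probs[i] >= deletion_threshold:
                out += ' '  # keep the existing space
        elif space_probs[i] > insertion_threshold:
            out += ' '  # insert a new space
    return out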
BENCHMARKS = [benchmark_name]

all_insertion_intervals = []
all_deletion_intervals = []

for benchmark in BENCHMARKS:
    # Pick the cases file that matches the benchmark's noise level.
    cases_path = paths.CASES_FILE_NOISY if benchmark.startswith("0.1") else paths.CASES_FILE_CLEAN
    cases_path = cases_path % (model_name, "wikipedia" if benchmark.startswith("0") else benchmark)
    sequence_cases = load_object(cases_path)
    print(len(sequence_cases))
    if labeling_model_name != "0":
        from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(labeling_model_name)
    # Note: the loop variable is rebound here, from the benchmark's name to a
    # Benchmark object.
    benchmark = Benchmark(benchmark, Subset.TUNING)
    case_db = []
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    for s_i, (correct, corrupt) in enumerate(zip(correct_sequences, corrupt_sequences)):
        if s_i == n:  # `n` caps the number of tuning sequences
            break
        print(benchmark.name, s_i)
        cases = sequence_cases[s_i]
        case_db.append([])
            # Tail of a helper that shifts or scales penalty values; its
            # header precedes this excerpt.
            penalties[i] += value
        else:
            penalties[i] *= value
    return penalties


if __name__ == "__main__":
    model_name = parameters["model"]
    model = UnidirectionalModel(model_name)
    backward = model.model.specification.backward
    if parameters["labeling_model"] == "0":
        labeling_model = None
    else:
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(parameters["labeling_model"])
    benchmark_name = parameters["benchmark"]
    if benchmark_name == "0":
        # No benchmark given: read sequences interactively from stdin.
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        benchmark = Benchmark(benchmark_name, get_subset(parameters["subset"]))
        if parameters["sequences"] == "corrupt":
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        else:
            sequences = benchmark.get_predicted_sequences(parameters["sequences"])
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
recurrent_units = [1024]
dense_units = [1024]
seq_len = 256
batch_size = parameters["batch_size"]
noise = parameters["noise"]
start_batch = parameters["start_batch"]

# Choose the character encoder for the requested vocabulary.
if parameters["vocabulary"] == "arxiv":
    encoder = get_arxiv_encoder()
elif parameters["vocabulary"] == "mixed":
    encoder = get_mixed_encoder()
else:
    vocab_size = int(parameters["vocabulary"])
    encoder = get_encoder(vocab_size)

model = BidirectionalLabelingEstimator()
if start_batch == 0:
    # Fresh run: create a new model specification.
    spec = BidirectionalLabelingEstimatorSpecification(recurrent_units=recurrent_units,
                                                       dense_units=dense_units,
                                                       dim=encoder.dim(),
                                                       name=name)  # `name` is defined earlier in the script
    model.initialize(spec, encoder)
else:
    # Resume training from the saved checkpoint.
    model.load(name)
    print("Model loaded.")

noise_inducer = None
if noise == "ocr":
    noise_inducer = OCRNoiseInducer(p=0.05, seed=1337)
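# Illustration (an assumption about the training signal, not code from this
# repository): the labeling estimator treats space prediction as per-character
# binary labeling. Given ground-truth text, each character of the space-free
# sequence gets label 1 if a space follows it in the original.
def space_labels(text: str):
    chars, labels = [], []
    for i, c in enumerate(text):
        if c == ' ':
            continue
        chars.append(c)
        labels.append(1 if i + 1 < len(text) and text[i + 1] == ' ' else 0)
    return ''.join(chars), labels

# Example: space_labels("the cat") == ("thecat", [0, 0, 1, 0, 0, 0])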
import sys

import project
from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
from src.interactive.sequence_generator import interactive_sequence_generator


if __name__ == "__main__":
    model = BidirectionalLabelingEstimator()
    model.load(sys.argv[1])
    threshold = float(sys.argv[2])
    for sequence in interactive_sequence_generator():
        # Strip all spaces, then re-insert one after every character whose
        # predicted space probability exceeds the threshold.
        sequence = sequence.replace(' ', '')
        result = model.predict(sequence)
        probabilities = result["probabilities"][1:]  # skip the start-of-sequence position
        predicted = ""
        for char, p in zip(sequence, probabilities):
            print(char, p)
            predicted += char
            if p > threshold:
                predicted += ' '
        print(predicted)
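# Run as: python3 <script.py> <labeling_model_name> <threshold>
# Illustration with made-up probabilities: for input "the cat" and threshold
# 0.5, the loop strips the input to "thecat"; if the model assigns a space
# probability of, say, 0.9 after 'e' and below 0.5 elsewhere, the script
# prints "the cat" as its prediction.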
if __name__ == "__main__": in_dir = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/" #in_dir = "/nfs/datasets/tokenization-repair/acl-anthology/raw/" process_id = int(sys.argv[1]) processes = 5 random.seed(42) files = os.listdir(in_dir) random.shuffle(files) files_per_process = math.ceil(len(files) / processes) files = sorted(files[(process_id * files_per_process):((process_id + 1) * files_per_process)]) uni_model = UnidirectionalModel("conll.fwd1024.ocr+spelling") bid_model = BidirectionalLabelingEstimator() bid_model.load("conll.labeling.ocr+spelling") corrector = BatchedBeamSearchCorrector(uni_model.model, insertion_penalty=-7.6, deletion_penalty=-9.2, n_beams=5, verbose=False, labeling_model=bid_model, add_epsilon=True) out_dir = in_dir[:-1] + ".repaired_hyphens/" if not os.path.exists(out_dir): os.mkdir(out_dir) for file in files:
def load_bidirectional_model(robust: bool) -> BidirectionalLabelingEstimator:
    name = model_names.bidirectional_model_name(robust)
    model = BidirectionalLabelingEstimator()
    model.load(name)
    return model
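# Usage sketch: load the robust variant and query space probabilities for a
# space-free string, mirroring the interactive script above.
#
# model = load_bidirectional_model(robust=True)
# space_probs = model.predict("thecat")["probabilities"][1:]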
from typing import Dict, Optional

import numpy as np


class PenaltyFitter:
    def __init__(self,
                 model_name: str,
                 labeling_model_name: Optional[str] = None,
                 n_sequences: int = -1):
        self.model = UnidirectionalLMEstimator()
        self.model.load(model_name)
        self.backward = self.model.specification.backward
        self.space_label = self.model.encoder.encode_char(' ')
        self.n_sequences = n_sequences
        if labeling_model_name is None:
            self.labeling_model = None
        else:
            self.labeling_model = BidirectionalLabelingEstimator()
            self.labeling_model.load(labeling_model_name)

    def space_and_nospace_probabilities(self,
                                        state: Dict,
                                        nospace_label: int,
                                        bidir_space_prob: Optional[float] = None):
        # Probability of the next character with and without an inserted space.
        p_space = state["probabilities"][self.space_label]
        p_other = state["probabilities"][nospace_label]
        state_after_space = self.model.step(state, self.space_label, include_sequence=False)
        p_other_given_space = state_after_space["probabilities"][nospace_label]
        # P(space, char) = P(space) * P(char | space)
        p_space_total = p_space * p_other_given_space
        if bidir_space_prob is not None:
            # Mix in the bidirectional labeling model's space probability.
            p_space_total = p_space_total * bidir_space_prob
            p_other = p_other * (1 - bidir_space_prob)
        return p_space_total, p_other

    @staticmethod
    def optimal_value(penalty_case_pairs, minimize_errors=False):
        # Sweep the candidate penalties in descending order, counting how many
        # true and false positives each threshold would admit.
        penalty_case_pairs = sorted(penalty_case_pairs,
                                    key=lambda pair: pair[0],  # sort by penalty only; Case values need not be orderable
                                    reverse=True)
        total_true_positives = sum(1 for _, case in penalty_case_pairs
                                   if case == Case.TRUE_POSITIVE)
        penalty_case_pairs = [(penalty, case) for penalty, case in penalty_case_pairs
                              if penalty > 0]
        penalties = [np.inf] + [t for t, _ in penalty_case_pairs]
        tp_vec = [0]
        fp_vec = [0]
        tps = 0
        fps = 0
        for _, case in penalty_case_pairs:
            if case == Case.TRUE_POSITIVE:
                tps += 1
            else:
                fps += 1
            tp_vec.append(tps)
            fp_vec.append(fps)
        tp_vec = np.asarray(tp_vec)
        fp_vec = np.asarray(fp_vec)
        if minimize_errors:
            # Choose the penalty that minimizes the error count (FN + FP).
            fn_vec = total_true_positives - tp_vec
            errors = fn_vec + fp_vec
            best = int(np.argmin(errors))
            print("best errors=%i@%f (%i TP, %i FP, %i FN)" %
                  (errors[best], penalties[best], tp_vec[best], fp_vec[best], fn_vec[best]))
        else:
            # Choose the penalty that maximizes F1.
            precision = tp_vec / np.maximum(tp_vec + fp_vec, 1)  # avoid 0/0 at the empty prefix
            recall = tp_vec / total_true_positives
            f1 = [2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
                  for prec, rec in zip(precision, recall)]
            best = int(np.argmax(f1))
            print("best f1=%f@%f (precision=%f, recall=%f)" %
                  (f1[best], penalties[best], precision[best], recall[best]))
        return penalties[best]
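# Toy demonstration with made-up (penalty, outcome) pairs, not real tuning
# output: each pair records the penalty at which a candidate edit would still
# be accepted and whether accepting it was correct. Here the F1 sweep selects
# 4.0, the smallest penalty that still admits all three true positives at the
# cost of a single false positive.
if __name__ == "__main__":
    pairs = [(9.0, Case.TRUE_POSITIVE), (7.5, Case.TRUE_POSITIVE),
             (6.0, Case.FALSE_POSITIVE), (4.0, Case.TRUE_POSITIVE),
             (1.0, Case.FALSE_POSITIVE)]
    print(PenaltyFitter.optimal_value(pairs))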