def get_corrector(approach: str,
                  penalties: Optional[str],
                  insertion_penalty: float = 0,
                  deletion_penalty: float = 0):
    # Look up the forward language model and the optional bidirectional
    # labeling model that belong to the chosen approach.
    fwd_model_name, bid_model_name = APPROACHES2MODELS[approach]
    fwd_model = UnidirectionalLMEstimator()
    fwd_model.load(fwd_model_name)
    if bid_model_name is None:
        bid_model = None
    else:
        bid_model = BidirectionalLabelingEstimator()
        bid_model.load(bid_model_name)
    # Select the insertion/deletion penalties: the "ONE" approach stores a
    # single fixed pair; otherwise use either the explicit arguments or the
    # fitted values stored under the given penalties key.
    if approach == "ONE":
        p_ins, p_del = PENALTIES[approach]
    elif penalties is None:
        p_ins = insertion_penalty
        p_del = deletion_penalty
    else:
        p_ins, p_del = PENALTIES[approach][penalties]
    corrector = BatchedBeamSearchCorrector(fwd_model,
                                           insertion_penalty=-p_ins,
                                           deletion_penalty=-p_del,
                                           n_beams=5,
                                           # `benchmark` is expected to be defined in the enclosing module;
                                           # verbose output is only enabled in interactive mode.
                                           verbose=benchmark is None,
                                           labeling_model=bid_model,
                                           add_epsilon=bid_model is not None)
    return corrector
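A minimal usage sketch for the factory above. Assumptions: "ONE" is a valid key of APPROACHES2MODELS, and the returned BatchedBeamSearchCorrector exposes a correct(sequence) method, which this snippet does not show.

corrector = get_corrector("ONE", penalties=None)
# correct() is an assumed method name for repairing a single sequence with beam search.
repaired = corrector.correct("thisisatest")
print(repaired)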
    def __init__(self,
                 model_name: str,
                 labeling_model_name: Optional[str] = None,
                 n_sequences: int = -1):
        self.model = UnidirectionalLMEstimator()
        self.model.load(model_name)
        self.backward = self.model.specification.backward
        self.space_label = self.model.encoder.encode_char(' ')
        self.n_sequences = n_sequences
        if labeling_model_name is None:
            self.labeling_model = None
        else:
            self.labeling_model = BidirectionalLabelingEstimator()
            self.labeling_model.load(labeling_model_name)
Example #3
    def __init__(self,
                 model_name: str,
                 insertion_threshold: float,
                 deletion_threshold: float,
                 model: Optional[BidirectionalLabelingEstimator] = None):
        if model is None:
            model = BidirectionalLabelingEstimator()
            model.load(model_name)
        self.model = model
        self.insertion_threshold = insertion_threshold
        self.deletion_threshold = deletion_threshold
Example #4
    BENCHMARKS = [benchmark_name]

    all_insertion_intervals = []
    all_deletion_intervals = []

    for benchmark in BENCHMARKS:
        cases_path = paths.CASES_FILE_NOISY if benchmark.startswith("0.1") else paths.CASES_FILE_CLEAN
        cases_path = cases_path % (model_name, "wikipedia" if benchmark.startswith("0") else benchmark)

        sequence_cases = load_object(cases_path)

        print(len(sequence_cases))

        if labeling_model_name != "0":
            from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
            labeling_model = BidirectionalLabelingEstimator()
            labeling_model.load(labeling_model_name)

        benchmark = Benchmark(benchmark, Subset.TUNING)
        case_db = []

        correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
        corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)

        for s_i, (correct, corrupt) in enumerate(zip(correct_sequences, corrupt_sequences)):
            if s_i == n:
                break

            print(benchmark.name, s_i)
            cases = sequence_cases[s_i]
            case_db.append([])
                penalties[i] += value
            else:
                penalties[i] *= value
    return penalties


if __name__ == "__main__":
    model_name = parameters["model"]

    model = UnidirectionalModel(model_name)
    backward = model.model.specification.backward

    if parameters["labeling_model"] == "0":
        labeling_model = None
    else:
        labeling_model = BidirectionalLabelingEstimator()
        labeling_model.load(parameters["labeling_model"])

    benchmark_name = parameters["benchmark"]

    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        benchmark = Benchmark(benchmark_name, get_subset(parameters["subset"]))
        if parameters["sequences"] == "corrupt":
            sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        else:
            sequences = benchmark.get_predicted_sequences(
                parameters["sequences"])
        file_writer = PredictionsFileWriter(benchmark.get_results_directory() +
    recurrent_units = [1024]
    dense_units = [1024]
    seq_len = 256
    batch_size = parameters["batch_size"]
    noise = parameters["noise"]
    start_batch = parameters["start_batch"]

    if parameters["vocabulary"] == "arxiv":
        encoder = get_arxiv_encoder()
    elif parameters["vocabulary"] == "mixed":
        encoder = get_mixed_encoder()
    else:
        vocab_size = int(parameters["vocabulary"])
        encoder = get_encoder(vocab_size)

    model = BidirectionalLabelingEstimator()

    if start_batch == 0:
        spec = BidirectionalLabelingEstimatorSpecification(recurrent_units=recurrent_units,
                                                           dense_units=dense_units,
                                                           dim=encoder.dim(),
                                                           name=name)

        model.initialize(spec, encoder)
    else:
        model.load(name)
        print("Model loaded.")

    noise_inducer = None
    if parameters["noise"] == "ocr":
        noise_inducer = OCRNoiseInducer(p=0.05, seed=1337)
import sys

import project

from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
from src.interactive.sequence_generator import interactive_sequence_generator

if __name__ == "__main__":
    model = BidirectionalLabelingEstimator()
    model.load(sys.argv[1])
    threshold = float(sys.argv[2])
    for sequence in interactive_sequence_generator():
        # Remove all spaces and let the labeling model predict, for each
        # position, the probability that a space follows.
        sequence = sequence.replace(' ', '')
        result = model.predict(sequence)
        probabilities = result["probabilities"][1:]
        predicted = ""
        for char, p in zip(sequence, probabilities):
            print(char, p)
            predicted += char
            # Insert a space after the character if its space probability
            # exceeds the threshold given on the command line.
            if p > threshold:
                predicted += ' '
        print(predicted)
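The demo above takes the labeling model name and the space-probability threshold from the command line. A hypothetical invocation (the script filename and threshold value are only illustrative; the model name is the one used in the hyphen-repair example below) could look like:

python3 labeling_demo.py conll.labeling.ocr+spelling 0.5

Each line typed into the interactive prompt is stripped of spaces, the per-character space probabilities are printed, and a space is inserted after every character whose probability exceeds the threshold.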
if __name__ == "__main__":
    in_dir = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"
    #in_dir = "/nfs/datasets/tokenization-repair/acl-anthology/raw/"
    process_id = int(sys.argv[1])
    processes = 5
    random.seed(42)

    files = os.listdir(in_dir)
    random.shuffle(files)
    files_per_process = math.ceil(len(files) / processes)
    files = sorted(files[(process_id * files_per_process):((process_id + 1) *
                                                           files_per_process)])

    uni_model = UnidirectionalModel("conll.fwd1024.ocr+spelling")
    bid_model = BidirectionalLabelingEstimator()
    bid_model.load("conll.labeling.ocr+spelling")

    corrector = BatchedBeamSearchCorrector(uni_model.model,
                                           insertion_penalty=-7.6,
                                           deletion_penalty=-9.2,
                                           n_beams=5,
                                           verbose=False,
                                           labeling_model=bid_model,
                                           add_epsilon=True)

    out_dir = in_dir[:-1] + ".repaired_hyphens/"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    for file in files:
Example #9
def load_bidirectional_model(robust: bool) -> BidirectionalLabelingEstimator:
    name = model_names.bidirectional_model_name(robust)
    model = BidirectionalLabelingEstimator()
    model.load(name)
    return model
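A short usage sketch for the helper above, assuming model_names.bidirectional_model_name resolves the robust flag to one of the labeling model names shown in the other examples; predict() returning a dict with a "probabilities" entry follows the usage in the interactive demo above.

model = load_bidirectional_model(robust=True)
result = model.predict("thisisatest")   # input without spaces
print(result["probabilities"])          # per-position space probabilities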
class PenaltyFitter:
    def __init__(self,
                 model_name: str,
                 labeling_model_name: Optional[str] = None,
                 n_sequences: int = -1):
        self.model = UnidirectionalLMEstimator()
        self.model.load(model_name)
        self.backward = self.model.specification.backward
        self.space_label = self.model.encoder.encode_char(' ')
        self.n_sequences = n_sequences
        if labeling_model_name is None:
            self.labeling_model = None
        else:
            self.labeling_model = BidirectionalLabelingEstimator()
            self.labeling_model.load(labeling_model_name)

    def space_and_nospace_probabilities(
            self,
            state: Dict,
            nospace_label: int,
            bidir_space_prob: Optional[float] = None):
        # Probability of the next character being a space vs. the given
        # non-space character, according to the unidirectional model.
        p_space = state["probabilities"][self.space_label]
        p_other = state["probabilities"][nospace_label]
        # Advance the language model by a space and read the probability of
        # the non-space character following that space.
        state_after_space = self.model.step(state,
                                            self.space_label,
                                            include_sequence=False)
        p_other_given_space = state_after_space["probabilities"][nospace_label]
        p_space_total = p_space * p_other_given_space
        # Optionally combine with the bidirectional labeling model's space
        # probability for this position.
        if bidir_space_prob is not None:
            p_space_total = p_space_total * bidir_space_prob
            p_other = p_other * (1 - bidir_space_prob)
        return p_space_total, p_other

    @staticmethod
    def optimal_value(penalty_case_pairs, minimize_errors=False):
        # Sort candidate penalties in decreasing order and sweep over them as
        # thresholds: at each step one more case is accepted.
        penalty_case_pairs = sorted(penalty_case_pairs, reverse=True)
        total_true_positives = sum([
            1 for _, case in penalty_case_pairs if case == Case.TRUE_POSITIVE
        ])
        penalty_case_pairs = [(penalty, case)
                              for penalty, case in penalty_case_pairs
                              if penalty > 0]
        penalties = [np.inf] + [t for t, _ in penalty_case_pairs]
        # Cumulative true and false positives when accepting all cases whose
        # penalty is at least the current threshold.
        tp_vec = [0]
        fp_vec = [0]
        tps = 0
        fps = 0
        for _, case in penalty_case_pairs:
            if case == Case.TRUE_POSITIVE:
                tps += 1
            else:
                fps += 1
            tp_vec.append(tps)
            fp_vec.append(fps)
        tp_vec = np.asarray(tp_vec)
        fp_vec = np.asarray(fp_vec)
        if minimize_errors:
            # Pick the threshold with the fewest errors (false positives plus
            # false negatives).
            fn_vec = total_true_positives - tp_vec
            errors = fn_vec + fp_vec
            best = int(np.argmin(errors))
            print("best errors=%i@%f (%i TP, %i FP, %i FN)" %
                  (errors[best], penalties[best], tp_vec[best], fp_vec[best],
                   fn_vec[best]))
        else:
            # Pick the threshold with the best F1 score.
            precision = tp_vec / (tp_vec + fp_vec)
            recall = tp_vec / total_true_positives
            f1 = [
                2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
                for prec, rec in zip(precision, recall)
            ]
            best = int(np.argmax(f1))
            print("best f1=%f@%f (precision=%f, recall=%f)" %
                  (f1[best], penalties[best], precision[best], recall[best]))
        return penalties[best]
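A small, self-contained sketch of how optimal_value picks a penalty threshold. The penalty values are synthetic, and Case.FALSE_POSITIVE is an assumed member of the Case enum (the method only checks for Case.TRUE_POSITIVE and counts everything else as a false positive).

pairs = [(9.5, Case.TRUE_POSITIVE),
         (7.2, Case.TRUE_POSITIVE),
         (4.1, Case.FALSE_POSITIVE),
         (2.0, Case.TRUE_POSITIVE)]
# Sweeps the thresholds inf, 9.5, 7.2, 4.1, 2.0 and returns the one with the best F1.
best_penalty = PenaltyFitter.optimal_value(pairs, minimize_errors=False)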