# CLI script: loads beam-search penalty-fitting inputs for a given model/benchmark.
import project
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("model_name", "-m", "str"),
    Parameter("labeling", "-labeling", "str"),
    Parameter("benchmark", "-b", "str"),
    Parameter("sequences", "-seq", "str"),
    Parameter("lookahead", "-l", "int"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
import numpy as np
from src.helper.pickle import load_object
from src.helper.data_structures import izip
from src.settings import paths
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.corrector.beam_search.penalty_fitter import Case as CaseLabel, PenaltyFitter
from src.corrector.beam_search.penalty_holder import PenaltyHolder

if __name__ == "__main__":
    model_name = parameters["model_name"]
    benchmark_name = parameters["benchmark"]
    lookahead = parameters["lookahead"]
    sequence_file = parameters["sequences"]
    # Benchmarks whose name starts with "0.1" use the noisy cases file.
    if benchmark_name.startswith("0.1"):
        cases_path = paths.CASES_FILE_NOISY
    else:
        cases_path = paths.CASES_FILE_CLEAN
# CLI script: sets up training hyperparameters for a unidirectional character LM.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_name", "-name", "str"),
    Parameter("noise", "-noise", "float"),
    Parameter("start_batch", "-start", "int"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
import tensorflow as tf
from src.encoding.character_encoder import get_encoder
from src.data_fn.robust_data_fn_provicer import RobustDataFnProvider
from src.estimator.unidirectional_lm_estimator import UnidirectionalLMEstimatorSpecification, UnidirectionalLMEstimator

if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)
    name = parameters["model_name"]
    # Fixed architecture / training hyperparameters.
    vocab_size = 200
    recurrent_units = [1024]
    dense_units = [1024]
    seq_len = 256
    batch_size = 128
    noise = parameters["noise"]
    start_batch = parameters["start_batch"]
# CLI script head: declares the full set of training parameters for a character LM.
from project import src
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_name", "-name", "str"),
    # The "bidir" direction additionally requires the sigmoidal flag.
    Parameter("direction", "-dir", "str",
              help_message="Choose from {fwd, bwd, bidir}.",
              dependencies=[("bidir", Parameter("sigmoidal", "-sigmoid", "boolean"))]),
    Parameter("vocabulary", "-voc", "str"),
    Parameter("recurrent_units", "-ru", "int"),
    Parameter("dense_units", "-du", "int"),
    Parameter("dataset", "-data", "str"),
    Parameter("epochs", "-e", "int", default=1),
    Parameter("batch_size", "-bs", "int"),
    Parameter("sequence_length", "-len", "int"),
    Parameter("noise", "-noise", "str"),
    Parameter("start_batch", "-start", "int"),
    Parameter("steps", "-steps", "int"),
    Parameter("keep_n_checkpoints", "-keep", "int"),
    Parameter("keep_every_hours", "-every", "int"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
import tensorflow as tf
# CLI script: loads a unidirectional LM to collect penalty-tuning cases on a benchmark.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_name", "-m", "str"),
    Parameter("benchmark", "-b", "str"),
    Parameter("n_sequences", "-n", "int"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.estimator.unidirectional_lm_estimator import UnidirectionalLMEstimator
from src.helper.pickle import dump_object, load_object
from src.settings import paths
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.corrector.beam_search.penalty_tuning import Case

if __name__ == "__main__":
    model_name = parameters["model_name"]
    # Fill the cases-file template with model and benchmark name.
    path = paths.CASES_FILE_CLEAN % (model_name, parameters["benchmark"])
    benchmark_name = parameters["benchmark"]
    n_sequences = parameters["n_sequences"]
    LOOKAHEAD = 2
    model = UnidirectionalLMEstimator()
    model.load(model_name)
# CLI script head: parameters and imports for running the greedy corrector.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    # Combined model types additionally require a backward model name.
    Parameter("model_type", "-type", "str",
              help_message="Choose from {fwd, bwd, bidir, combined, softmax_old, sigmoid_old, combined_old}.",
              dependencies=[
                  ("combined", [Parameter("bwd_model_name", "-bwd", "str")]),
                  ("combined_old", [Parameter("bwd_model_name", "-bwd", "str")]),
              ]),
    Parameter("model_name", "-name", "str"),
    Parameter("noise_type", "-noise", "str"),
    Parameter("benchmark", "-benchmark", "str"),
    Parameter("out_file", "-f", "str"),
    Parameter("initialize", "-init", "boolean"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.load.load_char_lm import load_char_lm
from src.corrector.threshold_holder import ThresholdHolder
from src.corrector.greedy_corrector import GreedyCorrector
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.interactive.sequence_generator import interactive_sequence_generator
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import timestamp, time_diff
# CLI script head: parameters and imports for training the bidirectional labeling estimator.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_name", "-name", "str"),
    Parameter("vocabulary", "-voc", "str"),
    Parameter("dataset", "-data", "str"),
    Parameter("noise", "-noise", "str"),
    Parameter("batch_size", "-bs", "int"),
    Parameter("epochs", "-e", "int"),
    Parameter("start_batch", "-start", "int"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
import tensorflow as tf
from src.encoding.character_encoder import get_encoder, get_arxiv_encoder, get_mixed_encoder
from src.data_fn.robust_data_fn_provicer import RobustDataFnProvider
from src.data_fn.acl_robust_data_fn_provider import ACLRobustDataFnProvider
from src.data_fn.arxiv_robust_data_fn_provider import ArxivRobustDataFnProvider
from src.data_fn.file_reader_robust_data_fn_provider import FileReaderRobustDataFnProvider
from src.estimator.bidirectional_labeling_estimator import (BidirectionalLabelingEstimator,
                                                            BidirectionalLabelingEstimatorSpecification)
from src.noise.token_typo_inducer import TokenTypoInducer
from src.noise.ocr_noise_inducer import OCRNoiseInducer
from src.noise.char_and_punctuation_noise_inducer import CharAndPunctuationNoiseInducer
from src.noise.acl_noise_inducer import ACLNoiseInducer
import project from src.interactive.parameters import ParameterGetter, Parameter params = [ Parameter("n_tokens", "-n", "int"), Parameter("benchmark", "-b", "str"), Parameter("test", "-t", "boolean") ] getter = ParameterGetter(params) getter.print_help() parameters = getter.get() from src.corrector.baselines.maximum_matching_corrector import MaximumMatchingCorrector from src.interactive.sequence_generator import interactive_sequence_generator from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles from src.evaluation.predictions_file_writer import PredictionsFileWriter from src.helper.time import time_diff, timestamp if __name__ == "__main__": n_tokens = parameters["n_tokens"] corrector = MaximumMatchingCorrector(n=n_tokens) benchmark_name = parameters["benchmark"] if benchmark_name == "0": sequences = interactive_sequence_generator() file_writer = None else: subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT benchmark = Benchmark(benchmark_name, subset) sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT) file = benchmark.get_results_directory(
# CLI script: creates a corrupted-Wikipedia benchmark (development + test subsets).
from project import src
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("name", "-name", "str"),
    Parameter("corruption_probability", "-p", "float"),
    Parameter("noise_probability", "-noise", "float"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.datasets.wikipedia import Wikipedia
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.noise.typo_noise_inducer import TypoNoiseInducer
from src.sequence.token_corruptor import TokenCorruptor
from src.settings import constants
from src.helper.files import open_file

if __name__ == "__main__":
    SEED = 3010
    p = parameters["corruption_probability"]
    # Create empty benchmarks for both subsets.
    benchmark_name = parameters["name"]
    development_benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    test_benchmark = Benchmark(benchmark_name, Subset.TEST)
    # Read the source sequences.
    development_sequences = Wikipedia.development_sequences()
from project import src from src.interactive.parameters import Parameter, ParameterGetter params = [ Parameter("model_type", "-type", "str", help_message="Choose 'bidir' or 'combined'.", dependencies=[ ("bidir", [ Parameter("model_name", "-name", "str", help_message="Name of the model.") ]), ("combined", [ Parameter("fwd_model_name", "-fwd", "str", help_message="Name of the forward model."), Parameter("bwd_model_name", "-bwd", "str", help_message="Name of the backward model.") ]) ]), Parameter("benchmark", "-benchmark", "str"), Parameter("noise", "-noise", "str"), Parameter("insert", "-insert", "boolean"), Parameter("initialize", "-init", "boolean"), Parameter("threshold", "-t", "float")
# CLI script head: parameters, imports, and enums for threshold fitting.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("approach", "-a", "str"),
    Parameter("noise_level", "-n", "float"),
    Parameter("two_pass", "-tp", "str"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from enum import Enum
import numpy as np
from src.load.load_char_lm import load_default_char_lm
from src.benchmark.benchmark import Subset, BenchmarkFiles, get_benchmark
from src.benchmark.two_pass_benchmark import get_two_pass_benchmark
from src.sequence.transformation import space_corruption_positions
from src.optimization.threshold import optimal_f1_threshold
from src.corrector.threshold_holder import ThresholdHolder, FittingMethod, ThresholdType


class OperationType(Enum):
    """Kind of space edit operation."""
    INSERTION = 0
    DELETION = 1


class PredictionType(Enum):
    """Classification of a predicted edit."""
    FALSE_POSITIVE = 0
    TRUE_POSITIVE = 1
import project from src.interactive.parameters import Parameter, ParameterGetter params = [ Parameter("benchmark", "-benchmark", "str"), Parameter("subset", "-set", "str"), Parameter("n_sequences", "-n", "int"), Parameter("n_beams", "-b", "int"), Parameter("space_penalty", "-sp", "float"), Parameter("char_penalty", "-cp", "float"), Parameter("out_file", "-f", "str"), Parameter("segmentation_file", "-seg", "str"), Parameter("continue", "-c", "boolean") ] getter = ParameterGetter(params) getter.print_help() parameters = getter.get() from src.interactive.sequence_generator import interactive_sequence_generator from src.estimator.unidirectional_lm_estimator import UnidirectionalLMEstimator from src.spelling.spelling_beam_search_corrector import SpellingBeamSearchCorrector from src.benchmark.benchmark import BenchmarkFiles, SUBSETS, Benchmark from src.evaluation.predictions_file_writer import PredictionsFileWriter from src.helper.time import time_diff, timestamp if __name__ == "__main__": model = UnidirectionalLMEstimator() model.load("fwd1024") corrector = SpellingBeamSearchCorrector( model, n_beams=parameters["n_beams"],
# CLI script: prints F1, sequence accuracy and mean runtime per error probability.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("noise_level", "-n", "float"),
    Parameter("approach", "-a", "str"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
import numpy as np
from src.evaluation.results_holder import ResultsHolder, Metric
from src.benchmark.benchmark import get_benchmark_name, get_error_probabilities, Subset

if __name__ == "__main__":
    approach = parameters["approach"]
    holder = ResultsHolder()
    metrics = [Metric.F1, Metric.SEQUENCE_ACCURACY, Metric.MEAN_RUNTIME]
    values = {metric: [] for metric in metrics}
    for p in get_error_probabilities():
        benchmark_name = get_benchmark_name(parameters["noise_level"], p)
        benchmark_values = []
        for metric in metrics:
            value = holder.get(benchmark_name, Subset.TEST, approach, metric)
            benchmark_values.append(value)
            values[metric].append(value)
        # Pad the benchmark name to a fixed width of 7 characters.
        print_name = benchmark_name[:7]
        print_name += ' ' * (7 - len(print_name))
        print(print_name, ' '.join(str(value) for value in benchmark_values))
import project from src.interactive.parameters import Parameter, ParameterGetter import matplotlib.gridspec as gridspec params = [ Parameter("metric", "-m", "str", help_message="Choose metric(s) from {f1, acc, t}."), Parameter("approaches", "-a", "str"), Parameter("noise_level", "-n", "float"), Parameter("file_name", "-o", "str") ] getter = ParameterGetter(params) getter.print_help() parameters = getter.get() import numpy as np import matplotlib.pyplot as plt from src.evaluation.results_holder import Metric, ResultsHolder from src.benchmark.benchmark import Subset, get_benchmark_name, get_error_probabilities METRICS = { "f1": Metric.F1, "acc": Metric.SEQUENCE_ACCURACY, "t": Metric.MEAN_RUNTIME } Y_LABELS = { Metric.F1: "F-score",
# CLI script head: parameters and imports for the iterative window corrector.
import project
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("model", "-m", "str",
              help_message="Choose a model out of {combined, combined_robust, softmax, softmax_robust, sigmoid, "
                           "sigmoid_robust}."),
    # Benchmark "0" (interactive mode) additionally requires a noise type.
    Parameter("benchmark", "-b", "str",
              dependencies=[("0", [Parameter("noise_type", "-n", "str")])]),
    Parameter("test", "-t", "boolean"),
    Parameter("two_pass", "-tp", "str"),
    Parameter("continue", "-c", "boolean"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.benchmark.two_pass_benchmark import TwoPassBenchmark
from src.corrector.iterative_window_corrector import IterativeWindowCorrector
from src.corrector.threshold_holder import ThresholdHolder, FittingMethod
from src.load.load_char_lm import load_default_char_lm
from src.interactive.sequence_generator import interactive_sequence_generator
from src.helper.time import time_diff, timestamp
from src.evaluation.predictions_file_writer import PredictionsFileWriter
"""Evaluates a language model's performance."""
from project import src
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    # "combined" additionally requires a backward model name.
    Parameter("model_type", "-type", "str",
              help_message="Choose from {fwd, bwd, bidir, combined}.",
              dependencies=[("combined", [Parameter("bwd_model_name", "-bwd", "str")])]),
    Parameter("model_name", "-name", "str"),
    Parameter("benchmark", "-b", "str",
              help_message="Choose from {new, old}. 'old' is the project dev set and 'new' the paper dev set."),
    Parameter("n_sequences", "-n", "int"),
    Parameter("interactive", "-i", "boolean"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.datasets.wikipedia import Wikipedia
from src.load.load_char_lm import load_char_lm
import project from src.interactive.parameters import ParameterGetter, Parameter params = [ Parameter("bigram_postprocessing", "-bi", "boolean"), Parameter("interactive", "-i", "boolean"), Parameter("test", "-t", "boolean"), Parameter("file_name", "-f", "str"), Parameter("verbose", "-v", "boolean") ] getter = ParameterGetter(params) getter.print_help() parameters = getter.get() import numpy as np from src.interactive.sequence_generator import interactive_sequence_generator from src.baselines.dynamic_programming import DynamicProgrammingCorrector from src.benchmark.benchmark import Subset, BenchmarkFiles, NOISE_LEVELS, get_benchmark from src.helper.time import time_diff, timestamp from src.evaluation.predictions_file_writer import PredictionsFileWriter if __name__ == "__main__": corrector = DynamicProgrammingCorrector( bigram_postprocessing=parameters["bigram_postprocessing"], allow_len_1=False, minimize_token_number=False) if parameters["interactive"]: benchmarks = [None] else:
from project import src from src.interactive.parameters import Parameter, ParameterGetter params = [ Parameter("benchmark", "-b", "str", help_message="Name of the benchmark."), Parameter("set", "-set", "str"), Parameter("sequences", "-n", "int"), Parameter("file", "-f", "str", help_message="Name of the file containing predicted sequences.") ] getter = ParameterGetter(params) getter.print_help() parameters = getter.get() from src.benchmark.benchmark import Benchmark, BenchmarkFiles, SUBSETS from src.evaluation.evaluator import Evaluator from src.helper.data_structures import izip from src.evaluation.print_methods import print_evaluator if __name__ == "__main__": benchmark_name = parameters["benchmark"] benchmark_subset = SUBSETS[parameters["set"]] benchmark = Benchmark(benchmark_name, subset=benchmark_subset) sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT) if parameters["file"] == "corrupt.txt": predicted_sequences = [corrupt for _, corrupt in sequence_pairs] else: predicted_sequences = benchmark.get_predicted_sequences(
from project import src from src.interactive.parameters import ParameterGetter, Parameter params = [ Parameter("benchmark", "-benchmark", "str", help_message="Name of the benchmark."), Parameter("n_sequences", "-n", "int"), Parameter("model_name", "-model", "str", help_message="Name of the unidirectional model to be used."), Parameter("backward", "-bwd", "boolean", help_message="Set 1 if the model is a backward model."), Parameter("n_beams", "-b", "int", help_message="Number of beams for beam search."), Parameter("average_log_likelihood", "-avg", "boolean", help_message="Set 1 to divide beam scores by sequence length."), Parameter( "penalty", "-p", "str", help_message="Penalty for edits, on probability scale between 0 and 1." + " Type a benchmark name to use pre-optimized threshold."),
# CLI script head: parameters and imports for the batched beam search corrector.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model", "-m", "str"),
    Parameter("benchmark", "-b", "str"),
    Parameter("subset", "-set", "str"),
    Parameter("sequences", "-seq", "str"),
    Parameter("n_sequences", "-n", "int"),
    Parameter("continue", "-c", "boolean"),
    Parameter("beams", "-w", "int"),
    Parameter("penalties", "-p", "str"),
    Parameter("penalty_modifier", "-pm", "str"),
    Parameter("out_file", "-f", "str"),
    Parameter("labeling_model", "-labeling", "str"),
    Parameter("lookahead", "-l", "int"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.models.char_lm.unidirectional_model import UnidirectionalModel
from src.benchmark.benchmark import Benchmark, BenchmarkFiles, get_subset
from src.corrector.beam_search.batched_beam_search_corrector import BatchedBeamSearchCorrector
from src.corrector.beam_search.penalty_holder import PenaltyHolder
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp
from src.interactive.sequence_generator import interactive_sequence_generator
from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator
# CLI script: runs the unigram corrector interactively or on a benchmark.
import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("words", "-w", "int"),
    Parameter("benchmark", "-b", "str"),
    Parameter("test", "-t", "boolean"),
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

# Heavy imports are deferred until after parameter parsing.
from src.interactive.sequence_generator import interactive_sequence_generator
from src.benchmark.benchmark import Benchmark, BenchmarkFiles, Subset
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp
from src.ngram.unigram_corrector import UnigramCorrector

if __name__ == "__main__":
    n = parameters["words"]
    # -1 means "use all words".
    n = None if n == -1 else n
    corrector = UnigramCorrector(n)
    print("%i words" % len(corrector.holder))
    print("%i bigrams" % len(corrector.bigrams))
    if parameters["benchmark"] == "0":
        # Benchmark "0" means interactive mode: read sequences from stdin.
        sequences = interactive_sequence_generator()
        writer = None
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        benchmark = Benchmark(parameters["benchmark"], subset)