Example #1

import project

from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("model_name", "-m", "str"),
    Parameter("labeling", "-labeling", "str"),
    Parameter("benchmark", "-b", "str"),
    Parameter("sequences", "-seq", "str"),
    Parameter("lookahead", "-l", "int")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()
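
ParameterGetter follows the same pattern in every example of this listing: declare typed flags, print usage, then read the values into a dict. A minimal sketch of the behavior these scripts appear to rely on (a hypothetical reimplementation, not the project's code):

# Hypothetical sketch of the flag parsing these scripts rely on; the real
# ParameterGetter additionally handles help messages, defaults and
# conditional dependencies.
import sys

def get_parameters_sketch(param_specs):
    casts = {"str": str, "int": int, "float": float,
             "boolean": lambda v: v not in ("0", "false", "False")}
    argv = sys.argv[1:]
    values = {}
    for name, flag, type_name in param_specs:
        if flag in argv:
            values[name] = casts[type_name](argv[argv.index(flag) + 1])
    return values

# e.g. with "script.py -m fwd1024 -l 2":
# get_parameters_sketch([("model_name", "-m", "str"), ("lookahead", "-l", "int")])
# returns {"model_name": "fwd1024", "lookahead": 2}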

import numpy as np

from src.helper.pickle import load_object
from src.helper.data_structures import izip
from src.settings import paths
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.corrector.beam_search.penalty_fitter import Case as CaseLabel, PenaltyFitter
from src.corrector.beam_search.penalty_holder import PenaltyHolder

if __name__ == "__main__":
    model_name = parameters["model_name"]
    benchmark_name = parameters["benchmark"]
    lookahead = parameters["lookahead"]
    sequence_file = parameters["sequences"]
    cases_path = (paths.CASES_FILE_NOISY if benchmark_name.startswith("0.1")
                  else paths.CASES_FILE_CLEAN)

Example #2

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_name", "-name", "str"),
    Parameter("noise", "-noise", "float"),
    Parameter("start_batch", "-start", "int")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

import tensorflow as tf

from src.encoding.character_encoder import get_encoder
from src.data_fn.robust_data_fn_provicer import RobustDataFnProvider
from src.estimator.unidirectional_lm_estimator import UnidirectionalLMEstimatorSpecification, UnidirectionalLMEstimator

if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)

    name = parameters["model_name"]

    vocab_size = 200
    recurrent_units = [1024]
    dense_units = [1024]
    seq_len = 256
    batch_size = 128
    noise = parameters["noise"]
    start_batch = parameters["start_batch"]

Example #3

from project import src

from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_name", "-name", "str"),
    Parameter("direction",
              "-dir",
              "str",
              help_message="Choose from {fwd, bwd, bidir}.",
              dependencies=[("bidir",
                             Parameter("sigmoidal", "-sigmoid", "boolean"))]),
    Parameter("vocabulary", "-voc", "str"),
    Parameter("recurrent_units", "-ru", "int"),
    Parameter("dense_units", "-du", "int"),
    Parameter("dataset", "-data", "str"),
    Parameter("epochs", "-e", "int", default=1),
    Parameter("batch_size", "-bs", "int"),
    Parameter("sequence_length", "-len", "int"),
    Parameter("noise", "-noise", "str"),
    Parameter("start_batch", "-start", "int"),
    Parameter("steps", "-steps", "int"),
    Parameter("keep_n_checkpoints", "-keep", "int"),
    Parameter("keep_every_hours", "-every", "int")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()
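
The dependencies argument (also used in Examples #5, #9, #14 and #15) apparently makes extra flags required only when the parent parameter takes a specific value, e.g. -sigmoid is only expected together with -dir bidir. A sketch of that resolution logic, inferred from usage rather than taken from the project:

# Inferred behavior, not the project's code: a dependency is a
# (trigger_value, extra_parameters) pair that becomes active when the
# parent parameter receives trigger_value on the command line.
def resolve_dependencies_sketch(params, values):
    active = list(params)
    for param in params:
        for trigger, extra in getattr(param, "dependencies", None) or []:
            if values.get(param.name) == trigger:
                # some examples pass a single Parameter, others a list
                active += extra if isinstance(extra, list) else [extra]
    return active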

import tensorflow as tf

Example #4

import project
from src.interactive.parameters import Parameter, ParameterGetter


params = [Parameter("model_name", "-m", "str"),
          Parameter("benchmark", "-b", "str"),
          Parameter("n_sequences", "-n", "int")]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()


from src.estimator.unidirectional_lm_estimator import UnidirectionalLMEstimator
from src.helper.pickle import dump_object, load_object
from src.settings import paths
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.corrector.beam_search.penalty_tuning import Case


if __name__ == "__main__":
    model_name = parameters["model_name"]
    benchmark_name = parameters["benchmark"]
    path = paths.CASES_FILE_CLEAN % (model_name, benchmark_name)
    n_sequences = parameters["n_sequences"]

    LOOKAHEAD = 2

    model = UnidirectionalLMEstimator()
    model.load(model_name)

Example #5

import project
from src.interactive.parameters import Parameter, ParameterGetter


params = [
    Parameter("model_type", "-type", "str",
              help_message="Choose from {fwd, bwd, bidir, combined, softmax_old, sigmoid_old, combined_old}.",
              dependencies=[
                  ("combined", [Parameter("bwd_model_name", "-bwd", "str")]),
                  ("combined_old", [Parameter("bwd_model_name", "-bwd", "str")])
              ]),
    Parameter("model_name", "-name", "str"),
    Parameter("noise_type", "-noise", "str"),
    Parameter("benchmark", "-benchmark", "str"),
    Parameter("out_file", "-f", "str"),
    Parameter("initialize", "-init", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()


from src.load.load_char_lm import load_char_lm
from src.corrector.threshold_holder import ThresholdHolder
from src.corrector.greedy_corrector import GreedyCorrector
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.interactive.sequence_generator import interactive_sequence_generator
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import timestamp, time_diff

Example #6

import project
from src.interactive.parameters import Parameter, ParameterGetter


params = [Parameter("model_name", "-name", "str"),
          Parameter("vocabulary", "-voc", "str"),
          Parameter("dataset", "-data", "str"),
          Parameter("noise", "-noise", "str"),
          Parameter("batch_size", "-bs", "int"),
          Parameter("epochs", "-e", "int"),
          Parameter("start_batch", "-start", "int")]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()


import tensorflow as tf

from src.encoding.character_encoder import get_encoder, get_arxiv_encoder, get_mixed_encoder
from src.data_fn.robust_data_fn_provicer import RobustDataFnProvider
from src.data_fn.acl_robust_data_fn_provider import ACLRobustDataFnProvider
from src.data_fn.arxiv_robust_data_fn_provider import ArxivRobustDataFnProvider
from src.data_fn.file_reader_robust_data_fn_provider import FileReaderRobustDataFnProvider
from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator, \
    BidirectionalLabelingEstimatorSpecification
from src.noise.token_typo_inducer import TokenTypoInducer
from src.noise.ocr_noise_inducer import OCRNoiseInducer
from src.noise.char_and_punctuation_noise_inducer import CharAndPunctuationNoiseInducer
from src.noise.acl_noise_inducer import ACLNoiseInducer

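Example #6 wires up four different noise inducers; all share the idea of corrupting training text with random character edits. A generic, self-contained illustration of that idea (not any of the project's actual inducers):

import random

def induce_noise_sketch(text, p, rng=None):
    # With probability p per character: delete it, duplicate it, or
    # replace it with a random lowercase letter.
    rng = rng or random.Random(0)
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    out = []
    for char in text:
        if rng.random() >= p:
            out.append(char)
            continue
        edit = rng.randrange(3)
        if edit == 0:
            continue                          # deletion
        elif edit == 1:
            out.append(char + char)           # duplication
        else:
            out.append(rng.choice(alphabet))  # replacement
    return "".join(out)

# induce_noise_sketch("tokenization repair", p=0.1)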

Example #7

import project
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("n_tokens", "-n", "int"),
    Parameter("benchmark", "-b", "str"),
    Parameter("test", "-t", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.corrector.baselines.maximum_matching_corrector import MaximumMatchingCorrector
from src.interactive.sequence_generator import interactive_sequence_generator
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    n_tokens = parameters["n_tokens"]
    corrector = MaximumMatchingCorrector(n=n_tokens)

    benchmark_name = parameters["benchmark"]
    if benchmark_name == "0":
        sequences = interactive_sequence_generator()
        file_writer = None
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        file = benchmark.get_results_directory(

Example #8

from project import src
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("name", "-name", "str"),
    Parameter("corruption_probability", "-p", "float"),
    Parameter("noise_probability", "-noise", "float")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.datasets.wikipedia import Wikipedia
from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.noise.typo_noise_inducer import TypoNoiseInducer
from src.sequence.token_corruptor import TokenCorruptor
from src.settings import constants
from src.helper.files import open_file

if __name__ == "__main__":
    SEED = 3010

    p = parameters["corruption_probability"]

    # create empty benchmarks
    benchmark_name = parameters["name"]
    development_benchmark = Benchmark(benchmark_name, Subset.DEVELOPMENT)
    test_benchmark = Benchmark(benchmark_name, Subset.TEST)

    # read sequences
    development_sequences = Wikipedia.development_sequences()

Example #9

from project import src
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_type",
              "-type",
              "str",
              help_message="Choose 'bidir' or 'combined'.",
              dependencies=[
                  ("bidir", [
                      Parameter("model_name",
                                "-name",
                                "str",
                                help_message="Name of the model.")
                  ]),
                  ("combined", [
                      Parameter("fwd_model_name",
                                "-fwd",
                                "str",
                                help_message="Name of the forward model."),
                      Parameter("bwd_model_name",
                                "-bwd",
                                "str",
                                help_message="Name of the backward model.")
                  ])
              ]),
    Parameter("benchmark", "-benchmark", "str"),
    Parameter("noise", "-noise", "str"),
    Parameter("insert", "-insert", "boolean"),
    Parameter("initialize", "-init", "boolean"),
    Parameter("threshold", "-t", "float")

Example #10

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("approach", "-a", "str"),
    Parameter("noise_level", "-n", "float"),
    Parameter("two_pass", "-tp", "str")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from enum import Enum
import numpy as np

from src.load.load_char_lm import load_default_char_lm
from src.benchmark.benchmark import Subset, BenchmarkFiles, get_benchmark
from src.benchmark.two_pass_benchmark import get_two_pass_benchmark
from src.sequence.transformation import space_corruption_positions
from src.optimization.threshold import optimal_f1_threshold
from src.corrector.threshold_holder import ThresholdHolder, FittingMethod, ThresholdType


class OperationType(Enum):
    INSERTION = 0
    DELETION = 1


class PredictionType(Enum):
    FALSE_POSITIVE = 0
    TRUE_POSITIVE = 1
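
Example #10 imports optimal_f1_threshold, which by its name picks the decision threshold that maximizes F1 on held-out decisions. A minimal self-contained sketch of that standard technique (an assumption about, not a copy of, the project's implementation):

def optimal_f1_threshold_sketch(scores, labels):
    # labels[i] is True where an operation is actually needed; assumes
    # distinct scores. Sweeps thresholds from high to low, tracking F1.
    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    positives = sum(labels)
    best_threshold, best_f1 = 0.0, 0.0
    true_positives = 0
    for n_predicted, i in enumerate(order, start=1):
        true_positives += labels[i]
        precision = true_positives / n_predicted
        recall = true_positives / positives if positives else 0.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
            if f1 > best_f1:
                best_threshold, best_f1 = scores[i], f1
    return best_threshold, best_f1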

Example #11

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("benchmark", "-benchmark", "str"),
    Parameter("subset", "-set", "str"),
    Parameter("n_sequences", "-n", "int"),
    Parameter("n_beams", "-b", "int"),
    Parameter("space_penalty", "-sp", "float"),
    Parameter("char_penalty", "-cp", "float"),
    Parameter("out_file", "-f", "str"),
    Parameter("segmentation_file", "-seg", "str"),
    Parameter("continue", "-c", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.interactive.sequence_generator import interactive_sequence_generator
from src.estimator.unidirectional_lm_estimator import UnidirectionalLMEstimator
from src.spelling.spelling_beam_search_corrector import SpellingBeamSearchCorrector
from src.benchmark.benchmark import BenchmarkFiles, SUBSETS, Benchmark
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp

if __name__ == "__main__":
    model = UnidirectionalLMEstimator()
    model.load("fwd1024")
    corrector = SpellingBeamSearchCorrector(
        model,
        n_beams=parameters["n_beams"],
        # plausible continuation inferred from the -sp/-cp parameters above;
        # the original listing breaks off inside this call
        space_penalty=parameters["space_penalty"],
        char_penalty=parameters["char_penalty"])

Example #12

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("noise_level", "-n", "float"),
    Parameter("approach", "-a", "str")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

import numpy as np

from src.evaluation.results_holder import ResultsHolder, Metric
from src.benchmark.benchmark import get_benchmark_name, get_error_probabilities, Subset

if __name__ == "__main__":
    approach = parameters["approach"]
    holder = ResultsHolder()
    metrics = [Metric.F1, Metric.SEQUENCE_ACCURACY, Metric.MEAN_RUNTIME]
    values = {metric: [] for metric in metrics}
    for p in get_error_probabilities():
        benchmark_name = get_benchmark_name(parameters["noise_level"], p)
        benchmark_values = []
        for metric in metrics:
            value = holder.get(benchmark_name, Subset.TEST, approach, metric)
            benchmark_values.append(value)
            values[metric].append(value)
        print_name = benchmark_name[:7].ljust(7)
        print(print_name, ' '.join(str(value) for value in benchmark_values))

Example #13

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("metric",
              "-m",
              "str",
              help_message="Choose metric(s) from {f1, acc, t}."),
    Parameter("approaches", "-a", "str"),
    Parameter("noise_level", "-n", "float"),
    Parameter("file_name", "-o", "str")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from src.evaluation.results_holder import Metric, ResultsHolder
from src.benchmark.benchmark import Subset, get_benchmark_name, get_error_probabilities

METRICS = {
    "f1": Metric.F1,
    "acc": Metric.SEQUENCE_ACCURACY,
    "t": Metric.MEAN_RUNTIME
}
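
The -m help text speaks of metric(s), so the value is presumably split and mapped through METRICS; a hypothetical usage line:

# Hypothetical: with "-m f1,acc" this would yield
# [Metric.F1, Metric.SEQUENCE_ACCURACY]; the separator is a guess.
selected_metrics = [METRICS[key] for key in parameters["metric"].split(",")]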

Y_LABELS = {
    Metric.F1: "F-score",
    # remaining label entries truncated in the original listing
}

Example #14

import project
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter(
        "model",
        "-m",
        "str",
        help_message=
        "Choose a model out of {combined, combined_robust, softmax, softmax_robust, sigmoid, "
        "sigmoid_robust}."),
    Parameter("benchmark",
              "-b",
              "str",
              dependencies=[("0", [Parameter("noise_type", "-n", "str")])]),
    Parameter("test", "-t", "boolean"),
    Parameter("two_pass", "-tp", "str"),
    Parameter("continue", "-c", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.benchmark.benchmark import Benchmark, Subset, BenchmarkFiles
from src.benchmark.two_pass_benchmark import TwoPassBenchmark
from src.corrector.iterative_window_corrector import IterativeWindowCorrector
from src.corrector.threshold_holder import ThresholdHolder, FittingMethod
from src.load.load_char_lm import load_default_char_lm
from src.interactive.sequence_generator import interactive_sequence_generator
from src.helper.time import time_diff, timestamp
from src.evaluation.predictions_file_writer import PredictionsFileWriter

Example #15

"""
Evaluates a language model's performance.
"""

from project import src
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model_type",
              "-type",
              "str",
              help_message="Choose from {fwd, bwd, bidir, combined}.",
              dependencies=[("combined",
                             [Parameter("bwd_model_name", "-bwd", "str")])]),
    Parameter("model_name", "-name", "str"),
    Parameter(
        "benchmark",
        "-b",
        "str",
        help_message=
        "Choose from {new, old}. 'old' is the project dev set and 'new' the paper dev set."
    ),
    Parameter("n_sequences", "-n", "int"),
    Parameter("interactive", "-i", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.datasets.wikipedia import Wikipedia
from src.load.load_char_lm import load_char_lm

Example #16

import project
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("bigram_postprocessing", "-bi", "boolean"),
    Parameter("interactive", "-i", "boolean"),
    Parameter("test", "-t", "boolean"),
    Parameter("file_name", "-f", "str"),
    Parameter("verbose", "-v", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

import numpy as np

from src.interactive.sequence_generator import interactive_sequence_generator
from src.baselines.dynamic_programming import DynamicProgrammingCorrector
from src.benchmark.benchmark import Subset, BenchmarkFiles, NOISE_LEVELS, get_benchmark
from src.helper.time import time_diff, timestamp
from src.evaluation.predictions_file_writer import PredictionsFileWriter

if __name__ == "__main__":
    corrector = DynamicProgrammingCorrector(
        bigram_postprocessing=parameters["bigram_postprocessing"],
        allow_len_1=False,
        minimize_token_number=False)

    if parameters["interactive"]:
        benchmarks = [None]
    else:
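
Example #16 breaks off while selecting benchmarks, but the corrector's name points at classic dynamic-programming segmentation: repair missing spaces by finding a minimal split of the text into vocabulary words. A self-contained sketch of that technique (not the project's implementation):

# Sketch of DP tokenization repair over a vocabulary set `words`;
# illustrates the technique the corrector's name suggests.
def segment_sketch(text, words, max_word_len=20):
    # best[i] = minimal number of tokens covering text[:i]
    best = [0] + [None] * len(text)
    split = [0] * (len(text) + 1)
    for i in range(1, len(text) + 1):
        for j in range(max(0, i - max_word_len), i):
            if best[j] is not None and text[j:i] in words:
                if best[i] is None or best[j] + 1 < best[i]:
                    best[i] = best[j] + 1
                    split[i] = j
    if best[-1] is None:
        return None  # no segmentation with this vocabulary
    tokens = []
    i = len(text)
    while i > 0:
        tokens.append(text[split[i]:i])
        i = split[i]
    return " ".join(reversed(tokens))

# segment_sketch("thehouseisbig", {"the", "house", "is", "big"})
# -> "the house is big"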

Example #17

from project import src
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("benchmark", "-b", "str", help_message="Name of the benchmark."),
    Parameter("set", "-set", "str"),
    Parameter("sequences", "-n", "int"),
    Parameter("file",
              "-f",
              "str",
              help_message="Name of the file containing predicted sequences.")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.benchmark.benchmark import Benchmark, BenchmarkFiles, SUBSETS
from src.evaluation.evaluator import Evaluator
from src.helper.data_structures import izip
from src.evaluation.print_methods import print_evaluator

if __name__ == "__main__":
    benchmark_name = parameters["benchmark"]
    benchmark_subset = SUBSETS[parameters["set"]]
    benchmark = Benchmark(benchmark_name, subset=benchmark_subset)

    sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
    if parameters["file"] == "corrupt.txt":
        predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
    else:
        predicted_sequences = benchmark.get_predicted_sequences(
            parameters["file"])  # argument inferred from the -f parameter's help text

Example #18

from project import src
from src.interactive.parameters import ParameterGetter, Parameter

params = [
    Parameter("benchmark",
              "-benchmark",
              "str",
              help_message="Name of the benchmark."),
    Parameter("n_sequences", "-n", "int"),
    Parameter("model_name",
              "-model",
              "str",
              help_message="Name of the unidirectional model to be used."),
    Parameter("backward",
              "-bwd",
              "boolean",
              help_message="Set 1 if the model is a backward model."),
    Parameter("n_beams",
              "-b",
              "int",
              help_message="Number of beams for beam search."),
    Parameter("average_log_likelihood",
              "-avg",
              "boolean",
              help_message="Set 1 to divide beam scores by sequence length."),
    Parameter(
        "penalty",
        "-p",
        "str",
        help_message="Penalty for edits, on probability scale between 0 and 1."
        + " Type a benchmark name to use pre-optimized threshold."),

Example #19

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("model", "-m", "str"),
    Parameter("benchmark", "-b", "str"),
    Parameter("subset", "-set", "str"),
    Parameter("sequences", "-seq", "str"),
    Parameter("n_sequences", "-n", "int"),
    Parameter("continue", "-c", "boolean"),
    Parameter("beams", "-w", "int"),
    Parameter("penalties", "-p", "str"),
    Parameter("penalty_modifier", "-pm", "str"),
    Parameter("out_file", "-f", "str"),
    Parameter("labeling_model", "-labeling", "str"),
    Parameter("lookahead", "-l", "int")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.models.char_lm.unidirectional_model import UnidirectionalModel
from src.benchmark.benchmark import Benchmark, BenchmarkFiles, get_subset
from src.corrector.beam_search.batched_beam_search_corrector import BatchedBeamSearchCorrector
from src.corrector.beam_search.penalty_holder import PenaltyHolder
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp
from src.interactive.sequence_generator import interactive_sequence_generator
from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator

Example #20

import project
from src.interactive.parameters import Parameter, ParameterGetter

params = [
    Parameter("words", "-w", "int"),
    Parameter("benchmark", "-b", "str"),
    Parameter("test", "-t", "boolean")
]
getter = ParameterGetter(params)
getter.print_help()
parameters = getter.get()

from src.interactive.sequence_generator import interactive_sequence_generator
from src.benchmark.benchmark import Benchmark, BenchmarkFiles, Subset
from src.evaluation.predictions_file_writer import PredictionsFileWriter
from src.helper.time import time_diff, timestamp
from src.ngram.unigram_corrector import UnigramCorrector

if __name__ == "__main__":
    n = parameters["words"]
    n = None if n == -1 else n
    corrector = UnigramCorrector(n)
    print("%i words" % len(corrector.holder))
    print("%i bigrams" % len(corrector.bigrams))

    if parameters["benchmark"] == "0":
        sequences = interactive_sequence_generator()
        writer = None
    else:
        subset = Subset.TEST if parameters["test"] else Subset.DEVELOPMENT
        benchmark = Benchmark(parameters["benchmark"], subset)
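
The listing ends mid-script; the benchmark examples in this collection pair such a setup with a timed correction loop that feeds a PredictionsFileWriter. A speculative reconstruction of the final block (the constructor argument and the correct(), add() and save() methods are guesses, not confirmed API):

        # Speculative continuation; the names below are inferred from the
        # imports above, not taken from the project.
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter("unigram.txt")  # hypothetical argument

    for sequence in sequences:
        start_time = timestamp()
        predicted = corrector.correct(sequence)
        runtime = time_diff(start_time)
        if writer is None:
            print(predicted)
        else:
            writer.add(predicted, runtime)
    if writer is not None:
        writer.save()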