Пример #1
0
def read_annotation_log(filename):
    """Read an annotation log file.

    Every line is split on tabs into at least three fields: the surface
    word, its analysis, and a status tag.  Lines tagged ``Eval``,
    ``Modified`` or ``Predicted`` yield an ``Annotation``; lines tagged
    ``Nonword`` add the word to the nonword list; any other tag is ignored.
    Lines with fewer than three fields are reported on stdout and skipped.

    Args:
        filename: Path of the UTF-8 encoded annotation log.

    Returns:
        A tuple ``(annotations, nonwords)``: a list of ``Annotation``
        objects and a list of nonword strings.
    """
    io = flatcat.FlatcatIO(encoding='utf-8')
    out = []
    nonwords = []
    # enforces splitting of hyphens and colons
    fs = flatcat.flatcat.ForceSplitter(':-', None)
    with codecs.open(filename, 'r', encoding='utf-8') as fobj:
        for line in fobj:
            line = line.strip()
            parts = line.split('\t')
            if len(parts) < 3:
                print('Cant parse annotation "{}"'.format(line))
                # Skip the malformed line; indexing parts[2] below would
                # otherwise raise IndexError and abort the whole read.
                continue
            if parts[2] in ('Eval', 'Modified', 'Predicted'):
                # Only the first analysis returned for the line is used.
                analysis = io.read_annotation(
                    parts[1],
                    construction_sep=' ',
                )[0]
                analysis = fs.enforce_one(analysis)
                out.append(Annotation(parts[0], analysis))
            elif parts[2] == 'Nonword':
                nonwords.append(parts[0])

    # multiple analyses for the same surface word
    # are returned as separate Annotations
    return (out, nonwords)
Пример #2
0
def ModelTraining(segmentation_file):
    """Build a Flatcat model from a segmentation file.

    The model is created with a corpus weight of 1.0, fed the contents of
    ``segmentation_file`` and has its HMM initialized.  No training call is
    made here, despite the function's name.

    Args:
        segmentation_file: Path passed to ``FlatcatIO.read_segmentation_file``.

    Returns:
        The initialized ``flatcat.FlatcatModel``.
    """
    reader = flatcat.FlatcatIO()
    flatcat_model = flatcat.FlatcatModel(
        flatcat.categorizationscheme.MorphUsageProperties(),
        corpusweight=1.0)
    segmentations = reader.read_segmentation_file(segmentation_file)
    flatcat_model.add_corpus_data(segmentations)
    flatcat_model.initialize_hmm()
    return flatcat_model
Пример #3
0
def load_flatcat_model(filename):
    """Read a Flatcat model from a tarball archive and initialize its HMM.

    Args:
        filename: Path of the tarball model file.

    Returns:
        The loaded model, after ``initialize_hmm()`` has been called on it.
    """
    loaded = flatcat.FlatcatIO().read_tarball_model_file(filename)
    loaded.initialize_hmm()
    return loaded
Пример #4
0
def read_old_annotations(filename):
    """Read annotations stored in the old two-column format.

    Every line is split on tabs into at least two fields: the surface word
    and either ``!`` (marking a nonword) or its analyses, which are read
    with a space as construction separator and a comma as analysis
    separator.  Lines with fewer than two fields are reported on stdout
    and skipped.

    Args:
        filename: Path of the UTF-8 encoded annotation file.

    Returns:
        A tuple ``(annotations, nonwords)``: a list of ``Annotation``
        objects and a list of nonword strings.
    """
    io = flatcat.FlatcatIO(encoding='utf-8')
    out = []
    nonwords = []
    with codecs.open(filename, 'r', encoding='utf-8') as fobj:
        for line in fobj:
            line = line.strip()
            parts = line.split('\t')
            if len(parts) < 2:
                print('Cant parse annotation "{}"'.format(line))
                # Skip the malformed line; indexing parts[1] below would
                # otherwise raise IndexError and abort the whole read.
                continue
            if parts[1] == '!':
                nonwords.append(parts[0])
            else:
                # Unlike the log format, the full read_annotation result
                # is stored in the Annotation here.
                analysis = io.read_annotation(
                    parts[1],
                    construction_sep=' ',
                    analysis_sep=',',
                )
                out.append(Annotation(parts[0], analysis))
    return (out, nonwords)
Пример #5
0
def fit_flatcat_model(datafile, corpusweight=1.0, randomState=None):
    """Create a Flatcat model from ``datafile`` and batch-train it.

    The ``random`` module is seeded with ``randomState`` first.  The final
    model cost and the elapsed training time are logged at INFO level.

    Args:
        datafile: Segmentation file read via ``FlatcatIO``.
        corpusweight: Corpus weight handed to ``flatcat.FlatcatModel``.
        randomState: Seed passed to ``random.seed``.

    Returns:
        The trained model.
    """
    random.seed(randomState)
    reader = flatcat.FlatcatIO()

    usage_properties = flatcat.categorizationscheme.MorphUsageProperties()
    model = flatcat.FlatcatModel(usage_properties, corpusweight=corpusweight)

    corpus = reader.read_segmentation_file(datafile)
    model.add_corpus_data(corpus)
    model.initialize_hmm()

    # Settings taken from
    # https://github.com/aalto-speech/flatcat/blob/master/flatcat/cmd.py#L755
    train_options = {
        # Training stops once the cost reduction between iterations drops
        # below this limit * #boundaries.
        'min_iteration_cost_gain': 0.0025,
        # Same kind of limit between epochs.  Disabled: in semi-supervised
        # training the cost is not monotonous between epochs, so the limit
        # is meaningless.
        'min_epoch_cost_gain': None,
        # Number of training epochs.
        'max_epochs': 4,
        # Iteration cap for each operation in the first epoch.
        'max_iterations_first': 1,
        # Iteration cap for each operation in the subsequent epochs.
        'max_iterations': 1,
        # Iteration cap for resegmentation in all epochs.
        'max_resegment_iterations': 2,
        # Presumably the maximum distance a morph boundary may move in a
        # shift operation -- confirm against the flatcat documentation.
        'max_shift_distance': 2,
        # Minimum number of letters remaining in the shorter morph after
        # a shift operation.
        'min_shift_remainder': 2,
    }

    started = arrow.now()
    model.train_batch(**train_options)
    LOG.info('Final cost: {}'.format(model.get_cost()))
    finished = arrow.now()
    LOG.info('Training time: {}'.format(finished - started))

    return model
Пример #6
0
def main(argv):
    """Select the next batch of training words to annotate for one metric.

    Steps:
      1. Parse ``argv`` and create the output directory if needed.
      2. Resolve input files, guessing names from the previous iteration
         number when they are not given on the command line.
      3. Load the model, the already-seen words, the nonwords and the
         optional old oracle word list.
      4. Rank the train pool (minus seen words) with the chosen metric,
         optionally followed by representative sampling.
      5. Write the score file, the selected words, the selected words not
         in the oracle, and the model's predicted segmentations for those.

    Args:
        argv: Command-line arguments handed to the argument parser.

    Raises:
        Exception: If no model file or more than one model file matches the
            guessed pattern, or if the given old oracle file is missing.
    """
    parser = get_argparser()
    args = parser.parse_args(argv)
    io = flatcat.FlatcatIO(encoding='utf-8')

    prev_iter = args.iteration - 1
    print('Metric: {}, Next iteration: {}, Previous iteration: {}'.format(
        args.metric, args.iteration, prev_iter))
    metric = METRICS[args.metric]()

    if not args.outdir[-1] == '/':
        args.outdir += '/'
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Guessing filenames

    model_filename = args.model
    if model_filename is None:
        print('Trying to guess model file...')
        model_prefix = '{}.flatcat.{}.'.format(prev_iter, args.metric)
        # List the same directory the match is joined with below; listing a
        # hard-coded 'models' directory produced wrong paths whenever
        # args.modeldir pointed somewhere else.
        for filename in os.listdir(args.modeldir):
            if not filename.startswith(model_prefix):
                continue
            if not filename.endswith('.model.tar.gz'):
                continue
            if model_filename is not None:
                # Ambiguous match: refuse to pick one arbitrarily.
                raise Exception(
                    'Both "{}" and "{}" match the model pattern'.format(
                        model_filename, filename))
            model_filename = os.path.join(args.modeldir, filename)
        if model_filename is None:
            raise Exception('Model not found')
        print('... guessing "{}"'.format(model_filename))

    nonword_filename = args.oldnonwords
    if nonword_filename is None:
        nonword_filename = os.path.join(
            'annotations',
            '{}.nonword.words'.format(prev_iter))
        print('Nonword file not specified, guessing "{}"'.format(
            nonword_filename))
    if not os.path.exists(nonword_filename):
        print('Nonword file ({}) not found, '
              'assuming all words are valid'.format(
            nonword_filename))
        nonword_filename = None

    oldselected_filename = args.oldselected
    if oldselected_filename is None:
        oldselected_filename = os.path.join(
            'annotations',
            '{}.train.{}.annotated.words'.format(
                prev_iter, args.metric))
        print('Previously selected word file not specified, '
              'guessing "{}"'.format(
            oldselected_filename))
    if not os.path.exists(oldselected_filename):
        print('Previously selected word file ({}) not found, '
              'assuming no selections have been made'.format(
            oldselected_filename))
        oldselected_filename = None

    # Unlike the guessed files above, an explicitly given oracle file
    # must exist.
    if args.oldoracle is not None:
        if not os.path.exists(args.oldoracle):
            raise Exception('Old oracle file "{}" not found'.format(
                args.oldoracle))

    # The metric name used in output filenames can be overridden.
    if args.overridemetric is not None:
        metric_out = args.overridemetric
    else:
        metric_out = args.metric

    selection_filename = os.path.join(
        args.outdir,
        '{}.train.{}.all.selected'.format(args.iteration, metric_out))
    scores_filename = os.path.join(
        args.outdir,
        '{}.train.{}.all.scores'.format(args.iteration, metric_out))
    unseen_filename = os.path.join(
        args.outdir,
        '{}.train.{}.unseen.selected'.format(args.iteration, metric_out))
    prediction_filename = os.path.join(
        args.outdir,
        '{}.train.{}.unseen.predictions'.format(args.iteration, metric_out))

    # load, initialize, read

    print('Loading model...')
    model = io.read_tarball_model_file(model_filename)
    print('...done')
    model.initialize_hmm()  # FIXME: automate

    if oldselected_filename is not None:
        seen = set(tools.read_wordlist(oldselected_filename))
    else:
        seen = set()

    if nonword_filename is not None:
        nonwords = tools.read_wordlist(nonword_filename)
    else:
        nonwords = []
    seen.update(nonwords)

    if args.oldoracle is not None:
        oracle = set(tools.read_wordlist(args.oldoracle))
    else:
        oracle = set()

    trainpool = next(tools.get_pools(['train'], args.pooldir))

    # already selected words (incl nonwords) cannot be reselected
    trainpool = tools.filter_pool(trainpool, seen)

    # perform selection
    selector = selection.Selector(
        metric, model,
        progress=flatcat.utils._generator_progress)
    if args.configcorpus is not None:
        print('Configuring metric with "{}"'.format(args.configcorpus))
        selector.configure(
            tools.read_wordlist(args.configcorpus),
            seen=seen)
    print('Performing ranking...')
    ranked = selector.rank(trainpool, seen=seen, n=args.num_annots)
    print('...done')

    # write scores (debug)
    selection.write_scores(ranked, scores_filename)

    # apply representative sampling, if needed
    if args.representative is not None and args.representative > 0:
        print('Performing representative sampling...')
        from morphsegannot.tools import representative
        # Sample num_annots words out of the top args.representative ones.
        truncated = [item.word for item in ranked[:args.representative]]
        selected = representative.representative_sampling(
            truncated, args.num_annots)
    else:
        selected = [item.word for item in ranked[:args.num_annots]]

    # write
    with codecs.open(selection_filename, 'w', encoding='utf-8') as selfobj:
        with codecs.open(unseen_filename, 'w', encoding='utf-8') as unfobj:
            with codecs.open(prediction_filename, 'w', encoding='utf-8') as prfobj:
                for word in selected:
                    selfobj.write('{}\n'.format(word))
                    # Words in the oracle are only listed in the selection
                    # file; the rest also get a predicted segmentation.
                    if word not in oracle:
                        unfobj.write('{}\n'.format(word))
                        (morphs, _) = model.viterbi_segment(word)
                        prfobj.write('{}\t{}\n'.format(
                            word, ' + '.join(morphs)))
Пример #7
0
def save_flatcat_model(filename, model):
    """Write ``model`` to ``filename`` as a Flatcat tarball model file.

    Args:
        filename: Destination path of the tarball.
        model: The Flatcat model to store.
    """
    flatcat.FlatcatIO().write_tarball_model_file(filename, model)
Пример #8
0
def main(argv):
    """Rank and select training words for each requested metric.

    For every metric name in ``args.metrics`` this locates the model file
    for the current iteration (unless an override model is given), collects
    the already-seen words and nonwords, ranks the remaining train pool and
    writes the selected words and their scores into ``args.outdir``.

    Args:
        argv: Command-line arguments handed to the argument parser.

    Raises:
        Exception: If more than one model file matches a metric's pattern,
            or if none matches and no override model was given.
    """
    parser = get_argparser()
    args = parser.parse_args(argv)
    io = flatcat.FlatcatIO(encoding='utf-8')

    if not args.outdir[-1] == '/':
        args.outdir += '/'
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # single model overriding the metric-specific ones
    overridemodel = None
    if args.overridemodel is not None:
        print('Loading overridemodel...')
        overridemodel = io.read_tarball_model_file(args.overridemodel)
        print('...done')
        overridemodel.initialize_hmm()  # FIXME: automate

    nonword_filename = os.path.join(args.annotsdir,
                                    '{}.nonword.words'.format(args.iteration))
    if not os.path.exists(nonword_filename):
        print('No nonword file ({}), assuming all words are valid'.format(
            nonword_filename))
        nonword_filename = None

    for metric_name in args.metrics:
        print('Metric: {}'.format(metric_name))
        metric = METRICS[metric_name]()
        # workaround for metric needing high and low models
        if metric_name.startswith('alphabracket'):
            metric.set_models(
                io.read_tarball_model_file(
                    os.path.join(
                        args.modeldir, '{}.flatcat.{}_low.model.tar.gz'.format(
                            args.iteration, 'alphabracket'))),
                io.read_tarball_model_file(
                    os.path.join(
                        args.modeldir, '{}.flatcat.{}_hi.model.tar.gz'.format(
                            args.iteration, 'alphabracket'))))

        # Words annotated in the previous iteration count as seen, unless
        # an explicit file overrides that.
        if args.overrideseen is None:
            annot_filename = os.path.join(
                args.annotsdir,
                '{}.train.{}.annotated.words'.format(args.iteration - 1,
                                                     metric_name))
        else:
            annot_filename = args.overrideseen

        # Find the single model file matching this iteration and metric.
        model_filename = None
        model_prefix = '{}.flatcat.{}.'.format(args.iteration, metric_name)
        for filename in os.listdir(args.modeldir):
            if not filename.startswith(model_prefix):
                continue
            if not filename.endswith('.model.tar.gz'):
                continue
            if model_filename is not None:
                # Ambiguous match: refuse to pick one arbitrarily.
                raise Exception(
                    'Both "{}" and "{}" match the model pattern'.format(
                        model_filename, filename))
            model_filename = os.path.join(args.modeldir, filename)
        if model_filename is None and args.overridemodel is None:
            raise Exception(
                'Model for metric "{}" not found'.format(metric_name))
        selection_filename = os.path.join(
            args.outdir, '{}.train.{}.selected'.format(args.iteration,
                                                       metric_name))
        scores_filename = os.path.join(
            args.outdir, '{}.train.{}.scores'.format(args.iteration,
                                                     metric_name))

        if os.path.exists(annot_filename):
            seen = set(tools.read_wordlist(annot_filename))
        else:
            print('No annotations file ({})'.format(annot_filename))
            seen = set()

        if nonword_filename is not None:
            nonwords = tools.read_wordlist(nonword_filename)
        else:
            nonwords = []
        seen.update(nonwords)

        if overridemodel is not None:
            model = overridemodel
        elif metric_name.startswith('alphabracket'):
            # This metric received its own low/high models above.
            model = None
        else:
            model = io.read_tarball_model_file(model_filename)
            model.initialize_hmm()

        # Builtin next() instead of the Python 2-only .next() method, so
        # this runs on both Python 2.6+ and Python 3.
        trainpool = next(tools.get_pools(['train'], args.pooldir))

        # already selected words (incl nonwords) cannot be reselected
        trainpool = tools.filter_pool(trainpool, seen)

        # perform selection
        selector = selection.Selector(
            metric, model, progress=flatcat.utils._generator_progress)
        if args.configcorpus is not None:
            print('Configuring metric with "{}"'.format(args.configcorpus))
            selector.configure(tools.read_wordlist(args.configcorpus),
                               seen=seen)
        print('Performing ranking...')
        ranked = selector.rank(trainpool, seen=seen, n=args.num_annots)
        print('...done')

        # write
        selection.write_selected(ranked, selection_filename, args.num_annots)
        selection.write_scores(ranked, scores_filename)