Example #1
import argparse
import logging
import sys
import time

# The following are assumed to come from the surrounding project (not shown
# here): Vocabulary, affixes, SegmentationModel, run_sampler, show_analyses.

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train segmentation model')
    parser.add_argument('-i', '--n_iter', type=int, required=True,
            help='Number of iterations')
    parser.add_argument('--alpha_p', type=float, default=0.001,
            help='Smoothing parameter for prefix Dirichlet prior')
    parser.add_argument('--alpha_s', type=float, default=0.001,
            help='Smoothing parameter for suffix Dirichlet prior')
    parser.add_argument('--strength', '-t', type=float, default=1e-6,
            help='DP prior strength')
    parser.add_argument('--collapse', action='store_true',
            help='Use approximate collapsed base')
    args = parser.parse_args()

    word_vocabulary = Vocabulary(start_stop=False)
    # Read one word per line from stdin; indexing the vocabulary assigns ids
    corpus = [word_vocabulary[line.decode('utf8').strip()] for line in sys.stdin]

    # Compute all possible prefixes and suffixes of the vocabulary words
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d tokens / %d types / %d prefixes / %d suffixes',
            len(corpus), len(word_vocabulary), len(prefix_vocabulary), len(suffix_vocabulary))

    model = SegmentationModel(args.strength, args.alpha_p, args.alpha_s,
            word_vocabulary, prefix_vocabulary, suffix_vocabulary, args.collapse)

    t_start = time.time()
    run_sampler(model, args.n_iter, corpus)
    runtime = time.time() - t_start
    logging.info('Sampler ran for %f seconds', runtime)

    show_analyses(model)
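
A hypothetical invocation, assuming the script is saved as train.py and reads
one word per line on stdin (the script and input file names are illustrative;
only -i is required):

    python train.py -i 1000 --strength 1e-6 --collapse < words.txt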
Example #2
import logging
import sys
from collections import Counter

# The following are assumed to come from the surrounding project (not shown
# here): Vocabulary, affixes, MultinomialProduct.

def main():
    logging.basicConfig(level=logging.INFO)
    # Read the training corpus
    word_vocabulary = Vocabulary(start_stop=False)
    analyses = {}
    for line in sys.stdin:
        word, analysis, _ = line.decode('utf8').split('\t')
        morphemes = analysis.split('+')
        if len(morphemes) not in (1, 2):
            raise ValueError('expected 1 or 2 morphemes, got %d: %s'
                    % (len(morphemes), analysis))
        prefix = morphemes[0]
        suffix = '' if len(morphemes) == 1 else morphemes[1]
        word_vocabulary[word]  # indexing adds the word to the vocabulary
        analyses[word] = (prefix, suffix)

    # Compute all possible prefixes and suffixes of the vocabulary words
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d types / %d prefixes / %d suffixes',
            len(word_vocabulary), len(prefix_vocabulary), len(suffix_vocabulary))

    prefix_counts = Counter()
    suffix_counts = Counter()
    for word, (prefix, suffix) in analyses.iteritems():
        prefix_counts[prefix_vocabulary[prefix]] += 1
        suffix_counts[suffix_vocabulary[suffix]] += 1

    ## The base distribution over (prefix, suffix) pairs
    base = MultinomialProduct(len(prefix_vocabulary), 0.001, len(suffix_vocabulary), 0.001)
    ## Add the observed affix counts to the base
    base.update(prefix_counts, suffix_counts)

    print base.log_likelihood()
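
The stdin format this script expects, as implied by the parsing code above:
three tab-separated fields (word, analysis, ignored third field), with the
analysis made of one or two morphemes joined by '+'. An illustrative input
line (the content is made up for illustration):

    walking	walk+ing	1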
Example #3
import argparse
import logging
import multiprocessing as mp
import sys
import time

# The following are assumed to come from the surrounding project (not shown
# here): Vocabulary, affixes, ParallelSegmentationModel, run_sampler.

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train segmentation model')
    parser.add_argument('-i', '--n_iter', type=int, required=True,
                        help='Number of iterations')
    parser.add_argument('--alpha_p', type=float, default=0.001,
                        help='Smoothing parameter for prefix Dirichlet prior')
    parser.add_argument('--alpha_s', type=float, default=0.001,
                        help='Smoothing parameter for suffix Dirichlet prior')
    parser.add_argument('--strength', '-t', type=float, default=1e-6,
                        help='DP prior strength')
    parser.add_argument('--processors', '-p', type=int, default=mp.cpu_count(),
                        help='Number of processors to use')
    parser.add_argument('-mh', type=int, required=True,
                        help='Number of MH steps per global iteration')
    parser.add_argument('--collapse', action='store_true',
                        help='Use approximate collapsed base')
    args = parser.parse_args()

    word_vocabulary = Vocabulary(start_stop=False)
    corpus = [word_vocabulary[line.decode('utf8').strip()] for line in sys.stdin]
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d tokens / %d types / %d prefixes / %d suffixes',
                 len(corpus), len(word_vocabulary), len(prefix_vocabulary), len(suffix_vocabulary))

    logging.info('Starting %d processes', args.processors)
    model = ParallelSegmentationModel(args.strength, args.alpha_p, args.alpha_s, corpus, word_vocabulary,
                                      prefix_vocabulary, suffix_vocabulary, args.processors, args.mh, args.collapse)

    t_start = time.time()
    run_sampler(model, args.n_iter)
    runtime = time.time() - t_start
    logging.info('Sampler ran for %f seconds', runtime)

    model.shutdown()

    # Print the model's analysis for each word type: word, prefix, suffix
    for word in word_vocabulary:
        p, s = model.decode(word)
        print(u'{}\t{}\t{}'.format(word, p, s).encode('utf8'))
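
A hypothetical invocation, assuming the script is saved as train_parallel.py
(the script and file names are illustrative). Both -i (iterations) and -mh
(MH steps per global iteration) are required; one word per line is read from
stdin, and each output line is word<TAB>prefix<TAB>suffix:

    python train_parallel.py -i 1000 -mh 5 -p 4 < words.txt > analyses.txt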