import argparse
import logging
import sys
import time

# Project-local imports (module names assumed): Vocabulary, affixes,
# SegmentationModel, run_sampler, show_analyses


def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train segmentation model')
    parser.add_argument('-i', '--n_iter', type=int, required=True,
                        help='Number of iterations')
    parser.add_argument('--alpha_p', type=float, default=0.001,
                        help='Smoothing parameter for prefix Dirichlet prior')
    parser.add_argument('--alpha_s', type=float, default=0.001,
                        help='Smoothing parameter for suffix Dirichlet prior')
    parser.add_argument('--strength', '-t', type=float, default=1e-6,
                        help='DP prior strength')
    parser.add_argument('--collapse', action='store_true',
                        help='Use approximate collapsed base')
    args = parser.parse_args()

    # Read the training corpus: one (unicode) word token per line on stdin
    word_vocabulary = Vocabulary(start_stop=False)
    corpus = [word_vocabulary[line.decode('utf8').strip()] for line in sys.stdin]

    # Compute all the possible prefixes and suffixes
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d tokens / %d types / %d prefixes / %d suffixes',
                 len(corpus), len(word_vocabulary),
                 len(prefix_vocabulary), len(suffix_vocabulary))

    model = SegmentationModel(args.strength, args.alpha_p, args.alpha_s,
                              word_vocabulary, prefix_vocabulary,
                              suffix_vocabulary, args.collapse)

    t_start = time.time()
    run_sampler(model, args.n_iter, corpus)
    runtime = time.time() - t_start
    logging.info('Sampler ran for %f seconds', runtime)

    show_analyses(model)
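# For reference, a minimal sketch of the Gibbs sampling loop that run_sampler
# is assumed to drive, matching the call above: each iteration resamples the
# analysis of every token in the corpus. The per-token resampling interface
# (decrement/increment) and the logging interval are assumptions, not taken
# from the actual implementation.
def run_sampler_sketch(model, n_iter, corpus):
    for it in xrange(n_iter):
        for word in corpus:
            model.decrement(word)  # remove the token's current analysis from the counts
            model.increment(word)  # sample a new analysis and add it back
        if (it + 1) % 10 == 0:
            logging.info('Iteration %d/%d: LL=%.2f', it + 1, n_iter,
                         model.log_likelihood())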
import logging
import sys
from collections import Counter

# Project-local imports (module names assumed): Vocabulary, affixes,
# MultinomialProduct


def main():
    logging.basicConfig(level=logging.INFO)

    # Read the gold analyses: word<TAB>analysis<TAB>... with morphemes joined by '+'
    word_vocabulary = Vocabulary(start_stop=False)
    analyses = {}
    for line in sys.stdin:
        word, analysis, _ = line.decode('utf8').split('\t')
        morphemes = analysis.split('+')
        if len(morphemes) not in (1, 2):
            raise ValueError('expected 1 or 2 morphemes, got: {}'.format(analysis))
        prefix = morphemes[0]
        suffix = '' if len(morphemes) == 1 else morphemes[1]
        word_vocabulary[word]  # register the word in the vocabulary
        analyses[word] = (prefix, suffix)

    # Compute all the possible prefixes and suffixes
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d types / %d prefixes / %d suffixes',
                 len(word_vocabulary), len(prefix_vocabulary),
                 len(suffix_vocabulary))

    # Count how often each prefix and suffix occurs in the gold analyses
    prefix_counts = Counter()
    suffix_counts = Counter()
    for word, (prefix, suffix) in analyses.iteritems():
        prefix_counts[prefix_vocabulary[prefix]] += 1
        suffix_counts[suffix_vocabulary[suffix]] += 1

    ## The base: independent smoothed multinomials over prefixes and suffixes
    base = MultinomialProduct(len(prefix_vocabulary), 0.001,
                              len(suffix_vocabulary), 0.001)
    ## Update the base with the gold counts and report the log-likelihood
    base.update(prefix_counts, suffix_counts)
    print base.log_likelihood()
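# For reference, a minimal sketch of the collapsed log-likelihood that a base
# like MultinomialProduct is assumed to compute: two independent symmetric
# Dirichlet-multinomials, one over prefixes and one over suffixes, so
# base.log_likelihood() above would correspond to
# dm_log_likelihood(prefix_counts, len(prefix_vocabulary), 0.001)
# + dm_log_likelihood(suffix_counts, len(suffix_vocabulary), 0.001).
# The function name and the exact form used by the real class are assumptions.
from math import lgamma

def dm_log_likelihood(counts, K, alpha):
    """log P(counts) with the multinomial integrated out under a symmetric
    Dirichlet(alpha) prior over K outcomes."""
    n = sum(counts.itervalues())
    ll = lgamma(K * alpha) - lgamma(K * alpha + n)
    for c in counts.itervalues():
        ll += lgamma(alpha + c) - lgamma(alpha)
    return ll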
import argparse
import logging
import multiprocessing as mp
import sys
import time

# Project-local imports (module names assumed): Vocabulary, affixes,
# ParallelSegmentationModel, run_sampler


def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train segmentation model')
    parser.add_argument('-i', '--n_iter', type=int, required=True,
                        help='Number of iterations')
    parser.add_argument('--alpha_p', type=float, default=0.001,
                        help='Smoothing parameter for prefix Dirichlet prior')
    parser.add_argument('--alpha_s', type=float, default=0.001,
                        help='Smoothing parameter for suffix Dirichlet prior')
    parser.add_argument('--strength', '-t', type=float, default=1e-6,
                        help='DP prior strength')
    parser.add_argument('--processors', '-p', type=int, default=mp.cpu_count(),
                        help='Number of processors to use')
    parser.add_argument('-mh', type=int, required=True,
                        help='Number of MH steps per global iteration')
    parser.add_argument('--collapse', action='store_true',
                        help='Use approximate collapsed base')
    args = parser.parse_args()

    # Read the training corpus: one (unicode) word token per line on stdin
    word_vocabulary = Vocabulary(start_stop=False)
    corpus = [word_vocabulary[line.decode('utf8').strip()] for line in sys.stdin]

    # Compute all the possible prefixes and suffixes
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d tokens / %d types / %d prefixes / %d suffixes',
                 len(corpus), len(word_vocabulary),
                 len(prefix_vocabulary), len(suffix_vocabulary))

    logging.info('Starting %d processes', args.processors)
    model = ParallelSegmentationModel(args.strength, args.alpha_p, args.alpha_s,
                                      corpus, word_vocabulary, prefix_vocabulary,
                                      suffix_vocabulary, args.processors, args.mh,
                                      args.collapse)

    t_start = time.time()
    run_sampler(model, args.n_iter)
    runtime = time.time() - t_start
    logging.info('Sampler ran for %f seconds', runtime)

    model.shutdown()

    # Decode and print the best prefix/suffix analysis of each word type
    for word in word_vocabulary:
        p, s = model.decode(word)
        print(u'{}\t{}\t{}'.format(word, p, s).encode('utf8'))
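# For reference, a minimal sketch of the affixes helper shared by these scripts,
# assumed to enumerate every prefix/suffix split of every word type and intern
# the resulting affixes into two vocabularies. The real implementation may
# restrict the set of splits; only the name and the return signature are taken
# from the calls above.
def affixes_sketch(word_vocabulary):
    prefix_vocabulary = Vocabulary(start_stop=False)
    suffix_vocabulary = Vocabulary(start_stop=False)
    for word in word_vocabulary:
        for k in range(len(word) + 1):  # k = 0 / k = len(word) give empty affixes
            prefix_vocabulary[word[:k]]  # indexing interns the affix and assigns an id
            suffix_vocabulary[word[k:]]
    return prefix_vocabulary, suffix_vocabulary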