def test(): from lattice_build import gen_lattice print '\n=== Running viterbi Search Test...' sent = u"材 料 利 用 率 高".split() word_list = [u'材料', u'利用', u'利用率', u'率', u'高'] max_word_len = 3 dummy_start = u'$START#' dummy_end = u"$END#" forward_lattice, backward_lattice = gen_lattice(word_list, sent, max_word_len, dummy_start) # scoring_model = u'I am a model' best_index_seq = viterbi_search(score_it, backward_lattice, sent, dummy_end) x = best_index_seq[:-1] y = best_index_seq[1:] z = zip(x, y) for index1, index2 in z: print forward_lattice[index1][index2] print "*".join(sent[index1:index2])
def test_single_sent(): #parameters max_word_len = 15 dummy_start, dummy_end = u'$START#', u'$END#' print '\nRunning test for exp_l2s_predict....' path_me_model = "../working_data/train.set1.i80.model" path_to_lexicon = "../working_data/train_testPredict.dict" sent = u"材 料 利 用 率 高".split() sent = u"下 雨 天 留 客 天 天 留 我 不 留".split() print "sample sentence is:", " - ".join(sent) # # loading maximum entropy model as the score function # print '\nInitializing maximum entropy model as the scoring model' model = ScoreModel(path_me_model) print 'done' # # loading lexicion # print "\nLoading lexicion file..." with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f: lexicon = [word for line in f for word in line.split()] print "lexicion size=", len( lexicon), "example word in lexicion:", " ".join(lexicon[:5]) lexicon = set(lexicon) print '\n====1 Bui latttice for the sample sentence=====' forward_unigram_lattice, backward_bigram_lattice = gen_lattice( lexicon, sent, max_word_len, dummy_start) print '\n====2 Runing Viterbi search to decode====' best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice, sent, dummy_end) #b_lattic_display(backward_bigram_lattice) #f_lattice_display(forward_unigram_lattice) x = best_index_seq[:-1] y = best_index_seq[1:] z = zip(x, y) segmented = [] for index1, index2 in z: word = u"".join(sent[index1:index2]) print word segmented.append(word) print '\nSegmented sent=', u" ".join(segmented)
def test_single_sent(): #parameters max_word_len = 15 dummy_start, dummy_end = u'$START#', u'$END#' print '\nRunning test for exp_l2s_predict....' path_me_model = "../working_data/train.set1.i80.model" path_to_lexicon = "../working_data/train_testPredict.dict" sent = u"材 料 利 用 率 高".split() sent = u"下 雨 天 留 客 天 天 留 我 不 留".split() print "sample sentence is:", " - ".join(sent) # # loading maximum entropy model as the score function # print '\nInitializing maximum entropy model as the scoring model' model = ScoreModel(path_me_model) print 'done' # # loading lexicion # print "\nLoading lexicion file..." with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f: lexicon = [word for line in f for word in line.split()] print "lexicion size=", len(lexicon), "example word in lexicion:", " ".join(lexicon[:5]) lexicon = set(lexicon) print '\n====1 Bui latttice for the sample sentence=====' forward_unigram_lattice, backward_bigram_lattice = gen_lattice(lexicon, sent, max_word_len, dummy_start) print '\n====2 Runing Viterbi search to decode====' best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice, sent, dummy_end) #b_lattic_display(backward_bigram_lattice) #f_lattice_display(forward_unigram_lattice) x = best_index_seq[:-1] y = best_index_seq[1:] z = zip(x, y) segmented = [] for index1, index2 in z: word = u"".join(sent[index1:index2]) print word segmented.append(word) print '\nSegmented sent=', u" ".join(segmented)
def core(parameter_tuple):
    # Worker entry point: unpack one bundled job, build the lattice for
    # the sentence, and return the training instances generated by
    # traversing it. Tuple layout:
    #   (sent, word_list, max_word_len, dummy_start, dummy_end)
    sent, word_list, max_word_len, dummy_start, dummy_end = parameter_tuple
    raw_sent = u"".join(sent)
    f_lattice, b_lattice = gen_lattice(
        word_list, raw_sent, max_word_len, dummy_start)
    # Valid states come from the gold word sequence itself.
    valid_state = gen_valid_state(sent, dummy_start, dummy_end)
    return gen_instance_by_traversal_lattice(
        valid_state, b_lattice, raw_sent, dummy_end)
def test(): word_set = [u'材料', u'利用', u'利用率', u'率', u'高'] word_seq = [u'材料', u'利用率', u'高'] raw_sent = u"".join(word_seq) max_word_len = 3 dummy_start, dummy_end = u'$START#', u'$END#' f_lattice, b_lattice = gen_lattice(word_set, raw_sent, max_word_len, dummy_start) b_lattic_display(b_lattice) valid_state = gen_valid_state(word_seq, dummy_start, dummy_end) instance = gen_instance_by_traversal_lattice(valid_state, b_lattice, raw_sent, dummy_end) print '\n\n===== Display all instances =====' for i in instance: print i[0], u"/".join(i[1])
def test(): from lattice_build import gen_lattice print "\n=== Running viterbi Search Test..." sent = u"材 料 利 用 率 高".split() word_list = [u"材料", u"利用", u"利用率", u"率", u"高"] max_word_len = 3 dummy_start = u"$START#" dummy_end = u"$END#" forward_lattice, backward_lattice = gen_lattice(word_list, sent, max_word_len, dummy_start) # scoring_model = u'I am a model' best_index_seq = viterbi_search(score_it, backward_lattice, sent, dummy_end) x = best_index_seq[:-1] y = best_index_seq[1:] z = zip(x, y) for index1, index2 in z: print forward_lattice[index1][index2] print "*".join(sent[index1:index2])
def main(path_corpus, path_me_model, path_to_lexicon, path_to_output): max_word_len = 12 dummy_start, dummy_end = u'$START#', u'$END#' # # loading maximum entropy model as the score function # print '\nInitializing maximum entropy model as the scoring model' model = ScoreModel(path_me_model) print 'done' # # loading lexicion # print "\nLoading lexicion file..." with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f: lexicon = [word for line in f for word in line.split()] print "lexicion size=", len( lexicon), "example word in lexicion:", " ".join(lexicon[:5]) lexicon = set(lexicon) segmented_corpus = [] print "\nLoading corpus to be segmented..." with codecs.open(path_corpus, 'rU', 'utf-8') as f: raw_corpus = [u"".join(line.split()) for line in f] print 'line count of raw_corpus=', len(raw_corpus) print 'the first line is ', raw_corpus[0] print "\n\n====Segmenting the corpus======" for sent in raw_corpus: #print '\n====1 Bui latttice for the sample sentence=====' forward_unigram_lattice, backward_bigram_lattice = gen_lattice( lexicon, sent, max_word_len, dummy_start) #print '\n====2 Runing Viterbi search to decode====' best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice, sent, dummy_end) #b_lattic_display(backward_bigram_lattice) #f_lattice_display(forward_unigram_lattice) x = best_index_seq[:-1] y = best_index_seq[1:] z = zip(x, y) segmented = [] for index1, index2 in z: word = u"".join(sent[index1:index2]) #print word segmented.append(word) print '\nSegmented sent=', u" ".join(segmented) segmented_corpus.append(u" ".join(segmented)) print "\nSegmentation done, writing it to file", path_to_output, '...' with codecs.open(path_to_output, 'w', 'utf-8') as f: for sent in segmented_corpus: f.write(sent + u'\n') print 'done' print "Program exit."
def main(path_corpus, path_me_model, path_to_lexicon, path_to_output): max_word_len = 12 dummy_start, dummy_end = u'$START#', u'$END#' # # loading maximum entropy model as the score function # print '\nInitializing maximum entropy model as the scoring model' model = ScoreModel(path_me_model) print 'done' # # loading lexicion # print "\nLoading lexicion file..." with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f: lexicon = [word for line in f for word in line.split()] print "lexicion size=", len(lexicon), "example word in lexicion:", " ".join(lexicon[:5]) lexicon = set(lexicon) segmented_corpus = [] print "\nLoading corpus to be segmented..." with codecs.open(path_corpus, 'rU', 'utf-8') as f: raw_corpus = [u"".join(line.split()) for line in f] print 'line count of raw_corpus=', len(raw_corpus) print 'the first line is ', raw_corpus[0] print "\n\n====Segmenting the corpus======" for sent in raw_corpus: #print '\n====1 Bui latttice for the sample sentence=====' forward_unigram_lattice, backward_bigram_lattice = gen_lattice(lexicon, sent, max_word_len, dummy_start) #print '\n====2 Runing Viterbi search to decode====' best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice, sent, dummy_end) #b_lattic_display(backward_bigram_lattice) #f_lattice_display(forward_unigram_lattice) x = best_index_seq[:-1] y = best_index_seq[1:] z = zip(x, y) segmented = [] for index1, index2 in z: word = u"".join(sent[index1:index2]) #print word segmented.append(word) print '\nSegmented sent=', u" ".join(segmented) segmented_corpus.append(u" ".join(segmented)) print "\nSegmentation done, writing it to file", path_to_output, '...' with codecs.open(path_to_output, 'w', 'utf-8') as f: for sent in segmented_corpus: f.write(sent + u'\n') print 'done' print "Program exit."