Пример #1
0
def test_single_sent():
    #parameters
    max_word_len = 15
    dummy_start, dummy_end = u'$START#', u'$END#'

    print '\nRunning test for exp_l2s_predict....'
    path_me_model = "../working_data/train.set1.i80.model"
    path_to_lexicon = "../working_data/train_testPredict.dict"

    sent = u"材 料 利 用 率 高".split()
    sent = u"下 雨 天 留 客 天 天 留 我 不 留".split()
    print "sample sentence is:", " - ".join(sent)

    #
    # loading maximum entropy model as the score function
    #
    print '\nInitializing maximum entropy model as the scoring model'
    model = ScoreModel(path_me_model)
    print 'done'

    #
    # loading lexicion
    #
    print "\nLoading lexicion file..."
    with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
        print "lexicion size=", len(
            lexicon), "example word in lexicion:", "  ".join(lexicon[:5])
        lexicon = set(lexicon)

    print '\n====1 Bui latttice for the sample sentence====='

    forward_unigram_lattice, backward_bigram_lattice = gen_lattice(
        lexicon, sent, max_word_len, dummy_start)

    print '\n====2 Runing Viterbi search to decode===='
    best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice,
                                    sent, dummy_end)
    #b_lattic_display(backward_bigram_lattice)
    #f_lattice_display(forward_unigram_lattice)

    x = best_index_seq[:-1]
    y = best_index_seq[1:]
    z = zip(x, y)

    segmented = []
    for index1, index2 in z:
        word = u"".join(sent[index1:index2])
        print word
        segmented.append(word)

    print '\nSegmented sent=', u" ".join(segmented)
Пример #2
0
def test_single_sent():
    #parameters
    max_word_len = 15
    dummy_start, dummy_end = u'$START#', u'$END#'

    print '\nRunning test for exp_l2s_predict....'
    path_me_model = "../working_data/train.set1.i80.model"
    path_to_lexicon = "../working_data/train_testPredict.dict"

    sent = u"材 料 利 用 率 高".split()
    sent = u"下 雨 天 留 客 天 天 留 我 不 留".split()
    print "sample sentence is:", " - ".join(sent)

    #
    # loading maximum entropy model as the score function
    #
    print '\nInitializing maximum entropy model as the scoring model'
    model = ScoreModel(path_me_model)
    print 'done'


    #
    # loading lexicion
    #
    print "\nLoading lexicion file..."
    with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
        print "lexicion size=", len(lexicon), "example word in lexicion:", "  ".join(lexicon[:5])
        lexicon = set(lexicon)

    print '\n====1 Bui latttice for the sample sentence====='

    forward_unigram_lattice, backward_bigram_lattice = gen_lattice(lexicon, sent, max_word_len, dummy_start)

    print '\n====2 Runing Viterbi search to decode===='
    best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice, sent, dummy_end)
    #b_lattic_display(backward_bigram_lattice)
    #f_lattice_display(forward_unigram_lattice)

    x = best_index_seq[:-1]
    y = best_index_seq[1:]
    z = zip(x, y)

    segmented = []
    for index1, index2 in z:
        word = u"".join(sent[index1:index2])
        print word
        segmented.append(word)

    print '\nSegmented sent=', u" ".join(segmented)
Пример #3
0
def main(path_corpus, path_me_model, path_to_lexicon, path_to_output):
    max_word_len = 12
    dummy_start, dummy_end = u'$START#', u'$END#'

    #
    # loading maximum entropy model as the score function
    #
    print '\nInitializing maximum entropy model as the scoring model'
    model = ScoreModel(path_me_model)
    print 'done'

    #
    # loading lexicion
    #
    print "\nLoading lexicion file..."
    with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
        print "lexicion size=", len(
            lexicon), "example word in lexicion:", "  ".join(lexicon[:5])
        lexicon = set(lexicon)

    segmented_corpus = []

    print "\nLoading corpus to be segmented..."
    with codecs.open(path_corpus, 'rU', 'utf-8') as f:
        raw_corpus = [u"".join(line.split()) for line in f]
        print 'line count of raw_corpus=', len(raw_corpus)
        print 'the first line is ', raw_corpus[0]

    print "\n\n====Segmenting the corpus======"
    for sent in raw_corpus:

        #print '\n====1 Bui latttice for the sample sentence====='

        forward_unigram_lattice, backward_bigram_lattice = gen_lattice(
            lexicon, sent, max_word_len, dummy_start)

        #print '\n====2 Runing Viterbi search to decode===='
        best_index_seq = viterbi_search(model.score_it,
                                        backward_bigram_lattice, sent,
                                        dummy_end)
        #b_lattic_display(backward_bigram_lattice)
        #f_lattice_display(forward_unigram_lattice)

        x = best_index_seq[:-1]
        y = best_index_seq[1:]
        z = zip(x, y)

        segmented = []
        for index1, index2 in z:
            word = u"".join(sent[index1:index2])
            #print word
            segmented.append(word)

        print '\nSegmented sent=', u" ".join(segmented)
        segmented_corpus.append(u" ".join(segmented))

    print "\nSegmentation done, writing it to file", path_to_output, '...'
    with codecs.open(path_to_output, 'w', 'utf-8') as f:
        for sent in segmented_corpus:
            f.write(sent + u'\n')

    print 'done'
    print "Program exit."
Пример #4
0
def main(path_corpus, path_me_model, path_to_lexicon, path_to_output):
    max_word_len = 12
    dummy_start, dummy_end = u'$START#', u'$END#'


    #
    # loading maximum entropy model as the score function
    #
    print '\nInitializing maximum entropy model as the scoring model'
    model = ScoreModel(path_me_model)
    print 'done'


    #
    # loading lexicion
    #
    print "\nLoading lexicion file..."
    with codecs.open(path_to_lexicon, 'rU', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
        print "lexicion size=", len(lexicon), "example word in lexicion:", "  ".join(lexicon[:5])
        lexicon = set(lexicon)

    segmented_corpus = []

    print "\nLoading corpus to be segmented..."
    with codecs.open(path_corpus, 'rU', 'utf-8') as f:
        raw_corpus = [u"".join(line.split()) for line in f]
        print 'line count of raw_corpus=', len(raw_corpus)
        print 'the first line is ', raw_corpus[0]

    print "\n\n====Segmenting the corpus======"
    for sent in raw_corpus:

        #print '\n====1 Bui latttice for the sample sentence====='

        forward_unigram_lattice, backward_bigram_lattice = gen_lattice(lexicon, sent, max_word_len, dummy_start)

        #print '\n====2 Runing Viterbi search to decode===='
        best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice, sent, dummy_end)
        #b_lattic_display(backward_bigram_lattice)
        #f_lattice_display(forward_unigram_lattice)

        x = best_index_seq[:-1]
        y = best_index_seq[1:]
        z = zip(x, y)

        segmented = []
        for index1, index2 in z:
            word = u"".join(sent[index1:index2])
            #print word
            segmented.append(word)

        print '\nSegmented sent=', u" ".join(segmented)
        segmented_corpus.append(u" ".join(segmented))

    print "\nSegmentation done, writing it to file", path_to_output, '...'
    with codecs.open(path_to_output, 'w', 'utf-8') as f:
        for sent in segmented_corpus:
            f.write(sent + u'\n')

    print 'done'
    print "Program exit."