예제 #1
0
import runner
import mark
from nltk.tokenize import sent_tokenize

# Prefix handed unchanged to runner.load() to select which saved model to load.
model_prefix = 'model/current'
# Loaded once at import time. runner.load() yields three values; `encdec` and
# `conf` are read by predict() in this module, `opt` is not used in the code
# visible here.
encdec, opt, conf = runner.load(model_prefix)

def split(source_text):
    """Break ``source_text`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``; the tokenizer's result is
    returned unchanged.
    """
    sentence_list = sent_tokenize(source_text)
    return sentence_list

def predict(source_text):
    """Annotate every sentence of ``source_text`` with the loaded model.

    The text is cut into sentences with ``split``; each sentence is run
    through ``runner.predict`` using the module-level ``conf`` and ``encdec``.

    Returns a list with one entry per sentence. Each entry is a list of
    dicts, one per token of that sentence, with the keys ``"source"`` (the
    token) and ``"annotation"`` (the value found at the same position in
    the output of ``mark.decoded_vec_to_hash``).
    """
    annotated = []
    for sentence in split(source_text):
        # Only the first element of the second return value is consumed;
        # the batch and the first half of that element are not needed here.
        _batch, hypotheses = runner.predict(conf, encdec, sentence)
        tokens = conf.corpus.tokenize(sentence, cleanup_tag=False)
        _first, decoded = hypotheses[0]
        marks = mark.decoded_vec_to_hash(decoded)
        # Annotations are looked up by token position, so `marks` must be
        # indexable for every position in `tokens`.
        annotated.append([
            {"source": token, "annotation": marks[pos]}
            for pos, token in enumerate(tokens)
        ])
    return annotated
예제 #2
0
    this file collects and runs ``importer``'s testsuite.

    :author: Sam Gammon <*****@*****.**>
    :license: This software follows the MIT (OSI-approved)
              license for open source software. A truncated
              version is included here; for full licensing
              details, see ``LICENSE.md`` in the root directory
              of the project.

              Copyright (c) 2013, Keen IO

              The above copyright notice and this permission notice shall be included in
              all copies or substantial portions of the Software.

              THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
              IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
              FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
              AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
              LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
              OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
              THE SOFTWARE.

'''

# testrunner
import runner

## Run the testsuite: fix up sys.path first, then load the suite and run it.
runner.fix_path()  # fix sys.path (called before anything is loaded)
runner.run(runner.load())  # whatever load() returns is handed straight to run()
예제 #3
0
파일: main.py 프로젝트: ttakamura/en_marker
sys.path.append('lib')
import corpus
import config
import runner
import mark

# Build the run configuration via config.parse_args().
conf = config.parse_args()
# NOTE: this rebinds the name `corpus`, shadowing the `corpus` module imported
# above; from here on `corpus` is whatever conf.open_corpus() returned (the
# interactive usage notes further down call methods on it).
corpus = conf.open_corpus()

# Dispatch on the configured mode. There is no `else` branch, so any other
# mode value falls through without doing anything.
if conf.mode() == 'console':
    # `embed` is not imported in the visible part of this file -- presumably
    # an interactive-shell helper such as IPython's embed; TODO confirm.
    embed()
elif conf.mode() == 'train':
    # Train, then report on the two score series that runner.train() returned.
    train_scores, test_scores = runner.train(conf)
    runner.report_bleu_graph(train_scores, test_scores)
elif conf.mode() == 'restore_console':
    # Load a saved model using the prefix from the configuration. Note that
    # `conf` is rebound here to the configuration returned by runner.load().
    encdec, opt, conf = runner.load(conf.load_prefix())
    embed()
    # Example session for the shell opened above.
    # usage: ---------------------------------------------------------------
    # source = "this is a pen."
    # batch, hyp = runner.predict(conf, encdec, source)
    # x = batch.data_at(0)
    # t, y = hyp[0]
    # mark.decoded_vec_to_str(y)
    #
    # In [24]: corpus.tokenize(source, cleanup_tag=False)
    # Out[24]: [u'<bos>', u'this', u'is', u'a', u'pen', u'.', u'<eos>']
    #
    # In [20]: corpus.ids_to_tokens(x)
    # Out[20]: [u'<bos>', u'this', u'is', u'a', u'<unk>', u'.', u'<eos>']
    #
    # In [21]: mark.decoded_vec_to_str(y)
예제 #4
0
    :author: Sam Gammon <*****@*****.**>
    :license: This software follows the MIT (OSI-approved)
              license for open source software. A truncated
              version is included here; for full licensing
              details, see ``LICENSE.md`` in the root directory
              of the project.

              Copyright (c) 2013, Keen IO

              The above copyright notice and this permission notice shall be included in
              all copies or substantial portions of the Software.

              THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
              IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
              FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
              AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
              LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
              OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
              THE SOFTWARE.

'''


# testrunner
import runner


## Run the testsuite: fix up sys.path first, then load the suite and run it.
runner.fix_path()  # fix sys.path (called before anything is loaded)
runner.run(runner.load())  # whatever load() returns is handed straight to run()
예제 #5
0
파일: main.py 프로젝트: ttakamura/en_marker
sys.path.append('lib')
import corpus
import config
import runner
import mark

# Build the run configuration via config.parse_args().
conf   = config.parse_args()
# NOTE: this rebinds the name `corpus`, shadowing the `corpus` module imported
# above; from here on `corpus` is whatever conf.open_corpus() returned (the
# interactive usage notes further down call methods on it).
corpus = conf.open_corpus()

# Dispatch on the configured mode. There is no `else` branch, so any other
# mode value falls through without doing anything.
if conf.mode() == 'console':
    # `embed` is not imported in the visible part of this file -- presumably
    # an interactive-shell helper such as IPython's embed; TODO confirm.
    embed()
elif conf.mode() == 'train':
    # Train, then report on the two score series that runner.train() returned.
    train_scores, test_scores = runner.train(conf)
    runner.report_bleu_graph(train_scores, test_scores)
elif conf.mode() == 'restore_console':
    # Load a saved model using the prefix from the configuration. Note that
    # `conf` is rebound here to the configuration returned by runner.load().
    encdec, opt, conf = runner.load(conf.load_prefix())
    embed()
    # Example session for the shell opened above.
    # usage: ---------------------------------------------------------------
    # source = "this is a pen."
    # batch, hyp = runner.predict(conf, encdec, source)
    # x = batch.data_at(0)
    # t, y = hyp[0]
    # mark.decoded_vec_to_str(y)
    #
    # In [24]: corpus.tokenize(source, cleanup_tag=False)
    # Out[24]: [u'<bos>', u'this', u'is', u'a', u'pen', u'.', u'<eos>']
    #
    # In [20]: corpus.ids_to_tokens(x)
    # Out[20]: [u'<bos>', u'this', u'is', u'a', u'<unk>', u'.', u'<eos>']
    #
    # In [21]: mark.decoded_vec_to_str(y)
예제 #6
0
import runner
import mark
from nltk.tokenize import sent_tokenize

# Prefix handed unchanged to runner.load() to select which saved model to load.
model_prefix = 'model/current'
# Loaded once at import time. runner.load() yields three values; `encdec` and
# `conf` are read by predict() in this module, `opt` is not used in the code
# visible here.
encdec, opt, conf = runner.load(model_prefix)


def split(source_text):
    """Break ``source_text`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``; the tokenizer's result is
    returned unchanged.
    """
    sentence_list = sent_tokenize(source_text)
    return sentence_list


def predict(source_text):
    """Annotate every sentence of ``source_text`` with the loaded model.

    The text is cut into sentences with ``split``; each sentence is run
    through ``runner.predict`` using the module-level ``conf`` and ``encdec``.

    Returns a list with one entry per sentence. Each entry is a list of
    dicts, one per token of that sentence, with the keys ``"source"`` (the
    token) and ``"annotation"`` (the value found at the same position in
    the output of ``mark.decoded_vec_to_hash``).
    """
    annotated = []
    for sentence in split(source_text):
        # Only the first element of the second return value is consumed;
        # the batch and the first half of that element are not needed here.
        _batch, hypotheses = runner.predict(conf, encdec, sentence)
        tokens = conf.corpus.tokenize(sentence, cleanup_tag=False)
        _first, decoded = hypotheses[0]
        marks = mark.decoded_vec_to_hash(decoded)
        # Annotations are looked up by token position, so `marks` must be
        # indexable for every position in `tokens`.
        annotated.append([
            {"source": token, "annotation": marks[pos]}
            for pos, token in enumerate(tokens)
        ])
    return annotated