Пример #1
0
def eval_mst(model_path, test_path, out_prefix, lowercase=True, tagger=None, force=False, result_strem=None):
    '''
    Run a trained MST parser model over a CONLL test file and evaluate the result.

    A preprocessed copy of the test data (tags and features stripped, optionally
    lowercased and re-tagged) is written to "<out_prefix>_eval_tagged.txt". The
    parser output is written to "<out_prefix>_out_tagged.txt" and is then compared
    against the original test file with eval_conll_paths().

    @param model_path: Path of the parser model, passed to MSTParser.test().
    @param test_path: Path of the CONLL test file.
    @param out_prefix: Prefix used to build the evaluation and output file paths.
    @param lowercase: Whether the evaluation copy of the test data is lowercased.
    @param tagger: Optional tagger model path; when given, the evaluation data is
                   re-tagged with StanfordPOSTagger(tagger).
    @param force: Rebuild the evaluation file even if it already exists.
    @param result_strem: Currently unused; kept (with its original spelling) so
                         that existing callers keep working.
    '''
    mp = MSTParser()

    # -------------------------------------------
    # Use the output prefix to create some new files.
    # -------------------------------------------
    eval_path = out_prefix + '_eval_tagged.txt'
    out_path  = out_prefix + '_out_tagged.txt'

    # -------------------------------------------
    # Rewrite the test file; POS tag the data
    # with the POS tags from our tagger,
    # and strip features.
    # -------------------------------------------
    if force or not os.path.exists(eval_path):
        LOG.log(1000, "")
        # Honor the caller's lowercase setting instead of always forcing True.
        cc = ConllCorpus.read(test_path, lowercase=lowercase)
        if lowercase:
            cc.lower()
        cc.strip_tags()
        cc.strip_feats()
        if tagger is not None:
            LOG.log(1000, "POS Tagging evaluation ")
            cc.tag(StanfordPOSTagger(tagger))

        # os.path.dirname() is '' when the prefix has no directory part, and
        # os.makedirs('') raises FileNotFoundError, so only create real paths.
        eval_dir = os.path.dirname(eval_path)
        if eval_dir:
            os.makedirs(eval_dir, exist_ok=True)
        cc.write(eval_path)
    # -------------------------------------------

    mp.test(model_path, eval_path, out_path)
    eval_conll_paths(test_path, out_path)
Пример #2
0
def train_tagger(prefix, slashtags=None, conll=None, tagmap = None, lowercase=False):
    '''
    Train a Stanford POS tagger from one or more CONLL files.

    Every sentence of every CONLL corpus is converted to its slashtags form and
    written, one sentence per line, to a temporary file. That file is handed to
    stanford_tagger.train_postagger(), which is asked to write "<prefix>.tagger".

    @param prefix: Output prefix; the tagger model path is prefix + '.tagger'.
    @param slashtags: Slashtags training files. Not supported yet: a non-empty
                      value raises NotImplementedError.
    @param conll: A list of CONLL pathnames to read training sentences from.
    @param tagmap: Passed through to ConllCorpus.read().
    @param lowercase: Passed through to ConllCorpus.read().
    @raise NotImplementedError: If any slashtags files are supplied.
    '''
    # Fail fast, before any corpus is read, since this input is unsupported.
    if slashtags:
        raise NotImplementedError

    trainsents = []
    for conll_path in (conll or []):
        cc = ConllCorpus.read(conll_path, lowercase=lowercase, tagmap=tagmap)
        trainsents.extend(sent.slashtags() for sent in cc)

    # delete=False: the file must survive close() so the trainer can read it
    # by name; it is removed explicitly below.
    alldatatrain = NamedTemporaryFile('w', delete=False)
    try:
        # -------------------------------------------
        # Now write all the training sentences out to the temporary file.
        # -------------------------------------------
        with alldatatrain:
            alldatatrain.writelines(trainsent + '\n' for trainsent in trainsents)

        # -------------------------------------------
        # And train the tagger.
        # -------------------------------------------
        stanford_tagger.train_postagger(alldatatrain.name, prefix+'.tagger')
    finally:
        # Always clean up the temporary file, even if writing or training failed.
        unlink(alldatatrain.name)
Пример #3
0
def conll_to_slashtags(infiles, outpath):
    '''
    Merge one or more CONLL files into a single corpus and write that corpus
    out as one lowercased slashtags file, using '/' as the delimiter argument.

    @param infiles: A list of CONLL pathnames to convert.
    @param outpath: A single output pathname for the slashtags.
    '''
    # Kept as a function-local import, as in the original code.
    # NOTE(review): presumably local to avoid a circular import -- confirm.
    from intent.corpora.conll import ConllCorpus

    main_c = POSCorpus()
    for f in infiles:
        # The original read into `cp` but extended with the undefined name `c`.
        main_c.extend(ConllCorpus.read(f))

    # Serialize the merged corpus (previously referenced the undefined `c`).
    st = main_c.slashtags('/', lowercase=True)

    # Create the containing path if it doesn't already exist. dirname() is ''
    # for a bare filename, and os.makedirs('') would raise FileNotFoundError.
    out_dir = os.path.dirname(outpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(outpath, 'w', encoding='utf-8') as of:
        of.write(st)