Example #1
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    (options, args) = parser.parse_args()
    
    ts = corpus2.get_named_tagset(options.tagset)
    p = wccl.Parser(ts)
    
    ops = [] # (name, op) pairs
    infiles = []
    for arg in args:
        if arg.endswith('.xml'):
            infiles.append(arg)
        elif arg.endswith('.ccl'):
            f = p.parseWcclFileFromPath(arg)
            ops.extend(f.gen_all_op_pairs())
        else:
            # parse arg as single op string
            op = p.parseAnyOperator(arg)
            ops.append((arg, op))
    if ops and infiles:
        for fname in infiles:
            rdr = corpus2.TokenReader.create_path_reader(options.input_format, ts, fname)
            for chunk in chunks(rdr):
                # dump op names
                print '\t'.join(name for (name, _) in ops)
                # iterate and dump values
                for sent in chunk.sentences():
                    for con in iter_sent(sent):
                        print '\t'.join(op.base_apply(con).to_string(ts) for (_, op) in ops)
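This example relies on two helpers that the snippet does not define: chunks() and iter_sent(). Below is a minimal sketch of chunks(), assuming only the get_next_chunk() reader call used in the other examples; iter_sent() is assumed to yield a WCCL evaluation context for each token position and is not reconstructed here.

def chunks(rdr):
    """Yield consecutive paragraphs ("chunks") from a corpus2 token reader."""
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            break
        yield chunk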
Example #2
def merge(source_paths,
          output_path=None,
          input_format='ccl',
          output_format='ccl',
          tagset='nkjp',
          chunks=False,
          prefix_chunks=False,
          prefix_sentences=False,
          documents_as_chunks=False):
    # load a tagset, create a reader
    if isinstance(tagset, str):
        tagset = corpus2.get_named_tagset(tagset)
    if output_path:
        writer = corpus2.TokenWriter.create_path_writer(
            output_format, output_path, tagset)
    else:
        writer = corpus2.TokenWriter.create_stdout_writer(
            output_format, tagset)
    for path in source_paths:
        reader = corpus2.TokenReader.create_path_reader(
            input_format, tagset, path)
        fname, _ = os.path.splitext(os.path.basename(path))
        fname = escape(fname)
        if chunks:
            chunk_no = 1
            # NOTE: the boolean `chunks` argument shadows any chunk-yielding
            # helper of the same name, so pull paragraphs from the reader directly
            while True:
                chunk = reader.get_next_chunk()
                if not chunk:
                    break
                if prefix_chunks:
                    if chunk.has_attribute('id'):
                        their_id = chunk.get_attribute('id')
                    else:
                        # autogen
                        their_id = ('auto%03d' % chunk_no)
                    full_id = 'file:%s:%s' % (fname, their_id)
                    chunk.set_attribute('id', full_id)
                writer.write_chunk(chunk)
                chunk_no += 1
        else:
            big_chunk = None
            if documents_as_chunks:
                big_chunk = corpus2.Chunk()
                big_chunk.set_attribute('id', 'file:%s:%s' % (fname, 'ch1'))
            sent_no = 1
            for sent in sentences(reader):
                if prefix_sentences:
                    if sent.id():
                        their_id = sent.id()
                    else:
                        # autogen
                        their_id = ('s%d' % sent_no)
                    full_id = 'file:%s:%s' % (fname, their_id)
                    sent.set_id(full_id)
                if big_chunk:
                    big_chunk.append(sent)
                else:
                    writer.write_sentence(sent)
                sent_no += 1
            if big_chunk:
                writer.write_chunk(big_chunk)
        del reader
    writer.finish()  # flush the writer
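merge() above also depends on two helpers that are not part of the snippet: sentences() and escape(). A rough sketch, assuming that sentences() simply drains the reader sentence by sentence (get_next_sentence() appears in the later examples) and that escape() only sanitizes the file name before it is embedded in an id attribute; the real implementations may differ.

import re


def sentences(reader):
    """Yield consecutive sentences from a corpus2 token reader."""
    while True:
        sent = reader.get_next_sentence()
        if not sent:
            break
        yield sent


def escape(name):
    # assumed behaviour: make the file name safe for use inside an id string
    return re.sub(r'[^A-Za-z0-9_.-]', '_', name)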
    def __init__(self):
        self.__wccl_files = []
        tagset = corpus2.get_named_tagset(TAGSET)

        p = wccl.Parser(tagset)
        for f in get_wccl_files():
            self.__wccl_files.append(p.parseWcclFileFromPath(f, RULES_DIR))
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    parser.add_option(
        '-p',
        '--par-sep',
        type='string',
        action='store',
        dest='par_sep',
        default='\n\n',
        help='set the paragraph separator; default: (two newlines)')
    parser.add_option('--ignore-ns-sent',
                      action='store_true',
                      default=False,
                      dest='ignore_ns_sent',
                      help='ignore no-space markers on sent boundaries')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        print 'Need to provide input and output.'
        print 'See --help for details.'
        print
        sys.exit(1)

    fn_input, fn_output = args

    with codecs.open(fn_output, 'wb', 'utf-8') as out:
        tagset = corpus2.get_named_tagset(options.tagset)
        rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                     tagset, fn_input)
        first = True
        while True:
            par = rdr.get_next_chunk()
            parfirst = True
            if not par:
                break
            for sent in par.sentences():
                sentfirst = True  # sentence start: may force a space via --ignore-ns-sent
                for tok in sent.tokens():
                    if not parfirst and ((sentfirst and options.ignore_ns_sent)
                                         or tok.after_space()):
                        out.write(' ')
                    out.write(unicode(tok.orth()))
                    sentfirst = False
                    parfirst = False
            out.write(options.par_sep)
Example #5
    def __init__(self, path):
        super(CCLNERParser, self).__init__(path)
        self.tagset = corpus2.get_named_tagset('nkjp')
        self.input_format = 'ccl'
        print path
        self.reader = corpus2.TokenReader.create_path_reader(self.input_format, self.tagset, path)
        self.sent_struct = {}

        self.logger = logging.getLogger('CCLNERParser')

        self.parse()
Example #6
    def __init__(self, liner_ini, tagset='nkjp'):

        self.tagset = corpus2.get_named_tagset(tagset)
        ChunkerFactory = JClass("g419.liner2.api.chunker.factory.ChunkerFactory")
        self.options = JClass("g419.liner2.api.LinerOptions")()
        self.options.parseModelIni(liner_ini)
        #self.chunkerManager = ChunkerFactory.loadChunkers(self.options)
        if not self.options.features.isEmpty():
            self.featureGen = JClass("g419.liner2.api.features.TokenFeatureGenerator")(self.options.features)
        else:
            self.featureGen = None
def get_morpho(options, corpname, outfname):
    tagset = corpus2.get_named_tagset(options.tagset)
    anal = Analyser(tagset, options.keep_case, bool(options.freq_list))
    rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset,
                                                 corpname)
    while True:
        tok = rdr.get_next_token()
        if not tok:
            break
        anal.consume(tok)
    del rdr
    anal.save(outfname)
    if options.freq_list:
        anal.save_freq(options.freq_list)
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='kipi',
                      help='set the tagset used in input; default: kipi')
    (options, args) = parser.parse_args()

    ts = corpus2.get_named_tagset(options.tagset)
    p = wccl.Parser(ts)

    ops = []  # (name, op) pairs
    infiles = []
    for arg in args:
        if arg.endswith('.xml'):
            infiles.append(arg)
        elif arg.endswith('.ccl'):
            f = p.parseWcclFileFromPath(arg)
            ops.extend(f.gen_all_op_pairs())
        else:
            # parse arg as single op string
            op = p.parseAnyOperator(arg)
            ops.append((arg, op))
    if ops and infiles:
        for fname in infiles:
            rdr = corpus2.TokenReader.create_path_reader(
                options.input_format, ts, fname)
            for chunk in chunks(rdr):
                # dump op names
                print '\t'.join(name for (name, _) in ops)
                # iterate and dump values
                for sent in chunk.sentences():
                    for con in iter_sent(sent):
                        print '\t'.join(
                            op.base_apply(con).to_string(ts)
                            for (_, op) in ops)
Example #9
    def test_one_sentence_path_writer(self):
        path = self._save_temp_file('one_sentence_path_writer.ccl',
                                    documents.short_ccl)
        self.tagset = corpus2.get_named_tagset('nkjp')

        reader = corpus2mwe.CclMWEReader(path, self.tagset)
        reader.use_annotations(False)
        doc = reader.read()
        writer = corpus2.TokenWriter.create_path_writer(
            'ccl:gz', path + '.gz', self.tagset)
        for chunk in doc.paragraphs():
            writer.write_chunk(chunk)
        del writer

        readerGZ = corpus2.TokenReader.create_path_reader(
            'ccl:gz', self.tagset, path + '.gz')
        self.assertEqual(u'Szlachetnie urodzona',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'żelazna dziewica',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'napchała się',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'lanymi kluskami',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
Example #10
    def setUp(self):
        self.tagset = corpus2.get_named_tagset('nkjp')
        self.temp_dir = os.path.join(tempfile.gettempdir(), "corpus2_test")
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)
        os.mkdir(self.temp_dir)
def main(ch_path, ref_path, chan_names, input_format, out_path, tagset,
         verbose, folds):

    chan_names = chan_names.split(",")

    chunkTable = CSVTable(";")
    chunkTable.addColumn('Nr')
    headTable = CSVTable(";")
    headTable.addColumn('Nr')
    bothTable = CSVTable(";")
    bothTable.addColumn('Nr')

    for chan_name in chan_names:
        chunkTable.addColumn(chan_name)
        chunkTable.addSubColumn(chan_name, "P", type="float")
        chunkTable.addSubColumn(chan_name, "R", type="float")
        chunkTable.addSubColumn(chan_name, "F", type="float")
        headTable.addColumn(chan_name)
        headTable.addSubColumn(chan_name, "P", type="float")
        headTable.addSubColumn(chan_name, "R", type="float")
        headTable.addSubColumn(chan_name, "F", type="float")
        bothTable.addColumn(chan_name)
        bothTable.addSubColumn(chan_name, "P", type="float")
        bothTable.addSubColumn(chan_name, "R", type="float")
        bothTable.addSubColumn(chan_name, "F", type="float")

    tagset = corpus2.get_named_tagset(tagset)

    for fold in range(1, folds + 1):
        if folds > 1:
            ch_path_fold = os.path.join(
                ch_path, 'ccl-test' + str(fold).zfill(2) + '.xml')
            ref_path_fold = os.path.join(
                ref_path, 'ccl-test' + str(fold).zfill(2) + '.xml')
        else:
            ch_path_fold = ch_path
            ref_path_fold = ref_path

        chunkResults = {}
        headResults = {}
        bothResults = {}

        for chan_name in chan_names:

            ch_rdr = corpus2.TokenReader.create_path_reader(
                input_format, tagset, ch_path_fold)
            ref_rdr = corpus2.TokenReader.create_path_reader(
                input_format, tagset, ref_path_fold)

            stats = Stats()

            while True:
                # iterate over paragraphs (note that they are called "chunks" here)
                ref_chunk = ref_rdr.get_next_chunk()
                ch_chunk = ch_rdr.get_next_chunk()
                assert (not ref_chunk) == (
                    not ch_chunk), 'corpora of different length'

                if not ref_chunk:
                    break  # end of input

                # process each sentence separately
                for ch_sent, ref_sent in zip(ch_chunk.sentences(),
                                             ref_chunk.sentences()):
                    assert ch_sent.size() == ref_sent.size()
                    ch_annots = get_annots(ch_sent, chan_name)
                    ref_annots = get_annots(ref_sent, chan_name)
                    stats.update(ch_annots, ref_annots)

            chunkResults[chan_name] = stats.getChunkStats()
            headResults[chan_name] = stats.getHeadStats()
            bothResults[chan_name] = stats.getBothStats()

        chunkTable.addRow(chunkResults)
        headTable.addRow(headResults)
        bothTable.addRow(bothResults)
    if folds > 1:
        chunkTable.countAvg()
        headTable.countAvg()
        bothTable.countAvg()

    if out_path != '':
        out = codecs.open(out_path, "w", "utf-8")
        out.write("Chunks--------------------------------------------------\n")
        out.write(chunkTable.__str__())
        out.write("Heads---------------------------------------------------\n")
        out.write(headTable.__str__())
        out.write("Both----------------------------------------------------\n")
        out.write(bothTable.__str__())
        out.close()
    else:
        print "Chunks--------------------------------------------------"
        print chunkTable
        print "Heads---------------------------------------------------"
        print headTable
        print "Both----------------------------------------------------"
        print bothTable
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='xces',
                      help='set the input format; default: xces')
    #parser.add_option('-o', '--output-format', type='string', action='store',
    #dest='output_format', default='xces',
    #help='set the output format; default: xces')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='kipi',
                      help='set the tagset used in input; default: kipi')
    parser.add_option('-v',
                      '--verbose',
                      action='store_true',
                      dest='verbose',
                      default=False,
                      help='report each token')
    parser.add_option('-n',
                      '--number-of-tags',
                      type='int',
                      action='store',
                      dest='num_tags',
                      default=10,
                      help='set the max number of tags to report')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        print 'You need to provide an input corpus.'
        print 'See %s --help' % sys.argv[0]
        sys.exit(1)

    inpath = args[0]
    # load a tagset, create a reader
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset,
                                                 inpath)
    # init stats (for this example)
    num_toks, num_sents, num_chunks = 0, 0, 0
    tag_count = dd(int)

    for chunk in chunks(rdr):
        for sent in chunk.sentences():
            for tok in sent.tokens():
                if options.verbose:
                    print tok.orth_utf8()

                for lex in tok.lexemes():
                    tag_str = tagset.tag_to_string(lex.tag())
                    tag_count[tag_str] += 1

                    if options.verbose:
                        lemma = lex.lemma_utf8()
                        print('+' if lex.is_disamb() else ' '), lemma, tag_str
                        # if you want a unicode object, orth_utf8().decode('utf-8')
                num_toks += 1
            num_sents += 1
        num_chunks += 1

    print 'Tokens:', num_toks
    print 'Sents: ', num_sents
    print 'Chunks:', num_chunks
    print
    print 'Most frequent tags:'
    for tc in sorted(tag_count.items(), key=lambda tc:
                     (-tc[1], tc[0]))[:options.num_tags]:
        print '\t%s\t%d' % tc
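The tag-counting example above omits its imports. Given the dd(int) call and the OptionParser usage, they are presumably along these lines (descr, the usage string, and a chunks() helper like the one sketched under Example #1 are assumed to be defined in the same script):

import sys
from collections import defaultdict as dd
from optparse import OptionParser

import corpus2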
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='ccl',
                      help='set the input format; default: ccl')
    parser.add_option('-o',
                      '--output-format',
                      type='string',
                      action='store',
                      dest='output_format',
                      default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option(
        '-r',
        '--result',
        type='string',
        action='store',
        dest='result_path',
        default='',
        help=
        'set filename for script output, output is appended to the file content; default: stdout'
    )
    parser.add_option('-I',
                      '--input-file',
                      type='string',
                      action='store',
                      dest='in_path',
                      default='',
                      help='set input filename (do not read from stdin)')
    parser.add_option(
        '-O',
        '--output-file',
        type='string',
        action='store',
        dest='out_path',
        default='',
        help=
        'set the output filename for processed input file with wccl rules applied'
    )
    parser.add_option(
        '-C',
        '--chunks',
        action='store_true',
        dest='preserve_chunks',
        default=False,
        help=
        'preserve input paragraph chunks (the default is to read sentences only)'
    )
    parser.add_option(
        '-D',
        '--lex-dir',
        type='string',
        action='store',
        dest='lex_dir',
        default='.',
        help='use the given directory to look for lexicon files (default: .)')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='kipi',
                      help='set the tagset used in input; default: kipi')
    parser.add_option(
        '-q',
        '--qtype',
        type='string',
        action='store',
        dest='default_qtype',
        default='unknown',
        help='set the question type for unresolved instances; default: unknown'
    )

    (options, args) = parser.parse_args()

    if len(args) != 1:
        sys.stderr.write('You need to provide a WCCL file.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    wccl_path = args[0]

    # create a tagset object for all subsequent processing
    # when reading input corpus, when creating new tags/tokens and when
    # creating a WCCL parser it must be specified what tagset is being used
    tagset = corpus2.get_named_tagset(options.tagset)

    # now instantiate a WCCL parser and parse the provided WCCL file
    # lex_dir is optional -- if the WCCL code references external
    # lexicons, they will be sought in the given search path
    # (defaults to '.')
    p = wccl.Parser(tagset)
    wccl_file = p.parseWcclFileFromPath(wccl_path, options.lex_dir)
    # check if there are any rules in the parsed WCCL file
    if not wccl_file.has_tag_rules() and not wccl_file.has_match_rules():
        sys.stderr.write('The given WCCL file contains no rules.\n')
        sys.exit(1)

    # create a corpus2 reader using the given input format specs
    # (e.g. xces,flat), tagset name
    # if options.in_path is empty, will create an stdin reader
    reader = get_reader(options.in_path, tagset, options.input_format)
    # create a corpus2 writer -- will be used to write the processed corpus

    writer = None
    if options.out_path != '':
        writer = get_writer(options.out_path, tagset, options.output_format)
    qtypes = [(QTYPE1, defaultdict(default_int)),
              (QTYPE2, defaultdict(default_int)),
              (QTYPE3, defaultdict(default_int))]

    while True:
        chunk = reader.get_next_chunk()
        if not chunk:
            break  # end of input
        # process each sentence separately
        for sent in chunk.sentences():
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            process_sent(asent, wccl_file)
            for token in asent.tokens():
                metadata = token.get_metadata()
                if metadata:
                    for qtype_key, qtype_dict in qtypes:
                        if metadata.has_attribute(qtype_key):
                            attr = metadata.get_attribute(qtype_key)
                            qtype_dict[attr] = qtype_dict[attr] + 1
        # save processed object
        # NOTE: if the input sent was not AnnotatedSentence, the changes
        # will be discarded
        if writer:
            if options.preserve_chunks:
                writer.write_chunk(chunk)
            else:
                for sent in chunk.sentences():
                    writer.write_sentence(sent)

    max_label = options.default_qtype
    summary = defaultdict(default_int)
    for qtype_key, qtype_dict in qtypes:
        for key_word in qtype_dict.keys():
            summary[key_word] = summary[key_word] + qtype_dict[key_word]
        if not summary.values():
            continue
        max_value = max(summary.values())
        if summary.values().count(max_value) == 1:
            max_label = max(summary.keys(), key=lambda k: summary[k])
            break

    return_label(max_label, options)
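default_int, used as the defaultdict factory for the qtype counters above, is not shown. A module-level function like the following matches how it is used; plain int would work just as well.

def default_int():
    # zero factory for the qtype counters
    return 0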
Example #14
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-I', '--input-file', type='string', action='store',
        dest='in_path', default='',
        help='set input filename (do not read from stdin)')
    parser.add_option('-O', '--output-file', type='string', action='store',
        dest='out_path', default='',
        help='set output filename (do not write to stdout)')
    parser.add_option('-C', '--chunks', action='store_true',
        dest='preserve_chunks', default=False,
        help='preserve input paragraph chunks (the default is to read sentences only)')
    parser.add_option('-D', '--lex-dir', type='string', action='store',
        dest='lex_dir', default='.',
        help='use the given directory to look for lexicon files (default: .)')
    parser.add_option('-A', '--ann-info', action='store_true',
        dest='ann_info', default=False,
        help='print annotation info')
    
    (options, args) = parser.parse_args()
    print args
    if len(args) != 1:
        sys.stderr.write('You need to provide a WCCL file.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    
    wccl_path = args[0]
    
    # create a tagset object for all subsequent processing
    # when reading input corpus, when creating new tags/tokens and when
    # creating a WCCL parser it must be specified what tagset is being used
    tagset = corpus2.get_named_tagset(options.tagset)
    
    # now instantiate a WCCL parser and parse the provided WCCL file
    # lex_dir is optional -- if the WCCL code references external
    # lexicons, they will be sought in the given search path
    # (defaults to '.')
    p = wccl.Parser(tagset)
    wccl_file = p.parseWcclFileFromPath(wccl_path, options.lex_dir)
    # check if there are any rules in the parsed WCCL file
    if not wccl_file.has_tag_rules() and not wccl_file.has_match_rules():
        sys.stderr.write('The given WCCL file contains no rules.\n')
        sys.exit(1)
    
    # create a corpus2 reader using the given input format specs
    # (e.g. xces,flat), tagset name
    # if options.in_path is empty, will create an stdin reader
    reader = get_reader(options.in_path, tagset, options.input_format)
    # create a corpus2 writer -- will be used to write the processed corpus
    writer = get_writer(options.out_path, tagset, options.output_format)
    
    # processing paragraph-by-paragraph
    if options.preserve_chunks:
        while True:
            chunk = reader.get_next_chunk()
            if not chunk:
                break # end of input
            # process each sentence separately
            for sent in chunk.sentences():
                # wrap the sentence as an AnnotatedSentence
                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
                process_sent(asent, wccl_file, options.ann_info)
            # save processed chunk
            # NOTE: if the input sent was not AnnotatedSentence, the changes
            # will be discarded
            writer.write_chunk(chunk)
    else:
        while True:
            sent = reader.get_next_sentence()
            if not sent:
                break # end of input
            # wrap the sentence as an AnnotatedSentence
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            process_sent(asent, wccl_file, options.ann_info)
            # save processed sentence (safe)
            # NOTE: if the input sent was not AnnotatedSentence, the changes
            # will be discarded
            writer.write_sentence(sent)
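The WCCL-applying examples read and write through get_reader()/get_writer() helpers that are not reproduced here. The sketch below is built only from corpus2 calls that appear elsewhere in these examples; the stdin reader used when no input file is given is left out, since its exact constructor is not shown anywhere above.

def get_writer(out_path, tagset, output_format):
    # path writer when a file name is given, stdout writer otherwise
    if out_path:
        return corpus2.TokenWriter.create_path_writer(
            output_format, out_path, tagset)
    return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)


def get_reader(in_path, tagset, input_format):
    # only the file-based case is sketched here
    if not in_path:
        raise NotImplementedError('stdin reading is not covered by this sketch')
    return corpus2.TokenReader.create_path_reader(input_format, tagset, in_path)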
Example #15
"""
Split annotated data to training and test set(s).
Usage:
    split_data.py by (documents | sentences) <src-dir> percentage <train-pct> [options]
    split_data.py by (documents | sentences) <src-dir> equal <numfolds> [options]

    -o=DIR, --output=DIR    output dir [default: <src-dir>]
"""
from docopt import docopt
import corpus2
import os
from random import shuffle
import numpy as np
from corpus_merge import merge

tagset = corpus2.get_named_tagset('nkjp')


def main():
    args = docopt(__doc__)
    print args
    out_dir = args['--output'] if args['--output'] != '<src-dir>' else args[
        '<src-dir>']
    if args['percentage']:
        args['<train-pct>'] = int(
            args['<train-pct>'])  # replace with Schema package
    else:
        args['<numfolds>'] = int(args['<numfolds>'])
    positive, negative = [], []
    for path in list_files(args['<src-dir>']):
        file_positive, file_negative = check_file_examples(
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-o',
                      '--output-format',
                      type='string',
                      action='store',
                      dest='output_format',
                      default='xces',
                      help='set the output format; default: xces')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    parser.add_option('-q',
                      '--quiet',
                      action='store_false',
                      default=True,
                      dest='verbose')
    parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        print 'You need to provide TAGOUT, MORPHO and OUTPUT files.'
        print 'See --help for details.'
        print
        sys.exit(1)

    tag_fn, mor_fn, out_fn = args
    tagset = corpus2.get_named_tagset(options.tagset)

    tag_rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                     tagset, tag_fn)
    mor_rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                     tagset, mor_fn)

    writer = corpus2.TokenWriter.create_path_writer(options.output_format,
                                                    out_fn, tagset)

    while True:
        mor_sent = mor_rdr.get_next_sentence()
        tag_sent = tag_rdr.get_next_sentence()
        assert (not mor_sent) == (not tag_sent)
        if not mor_sent:
            break
        for mor_tok, tag_tok in zip(mor_sent.tokens(), tag_sent.tokens()):
            assert unicode(mor_tok.orth()) == unicode(tag_tok.orth()), unicode(
                tag_tok.orth())
            tag_tok.set_wa(mor_tok.wa())
        writer.write_sentence(tag_sent)

    writer.finish()
Example #17
def go():
    parser = OptionParser(
        usage="Tool for preparing data for embeddings training")
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    parser.add_option(
        '-p',
        '--par-sep',
        type='string',
        action='store',
        dest='par_sep',
        default='',
        help='set the paragraph separator; default: (empty)')
    parser.add_option('-s',
                      '--sent-sep',
                      type='string',
                      action='store',
                      dest='sent_sep',
                      default='\n',
                      help='set the sentence separator; default: (newline)')
    parser.add_option('--separate-tokens',
                      action='store_true',
                      default=True,
                      dest='separate_tokens',
                      help='separate all tokens with space')
    parser.add_option('--ignore-ns-sent',
                      action='store_true',
                      default=False,
                      dest='ignore_ns_sent',
                      help='ignore no-space markers on sent boundaries')
    parser.add_option('-f',
                      '--feature',
                      type='string',
                      action='store',
                      dest='feature',
                      default='lemma',
                      help='feature to extract (orth, lemma, ctag, class, lemma.ctag, lemma.class, hypernym-<N>, synonym); default: lemma')
    parser.add_option(
        '-w',
        '--wordnet_path',
        type='string',
        action='store',
        dest='wordnet_path',
        default=
        '/home/michal/dev/ipi/korpusy/plwordnet_2_1_0/plwordnet_2_1_0_pwn_format',
        help='wordnet path')
    parser.add_option('-o',
                      '--output-format',
                      type='string',
                      action='store',
                      dest='output_format',
                      default='text',
                      help='set the output format (text); default: text')
    parser.add_option('-l',
                      '--limit',
                      type='int',
                      action='store',
                      dest='limit',
                      default=0,
                      help='set the tokens number limit; default: 0')
    parser.add_option(
        '--liner_jar',
        type='string',
        action='store',
        dest='liner_jar',
        default=
        '/home/michal/dev/ipi/liner2/g419-liner2-cli/build/libs/g419-liner2-cli-2.5-SNAPSHOT-all.jar',
        help='liner jar path, required for wordnet features')
    parser.add_option('--liner_lib',
                      type='string',
                      action='store',
                      dest='liner_lib',
                      default='/home/michal/dev/ipi/liner2/lib',
                      help='liner lib path, required for wordnet features')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        print 'Need to provide input and output.'
        print 'See --help for details.'
        print
        sys.exit(1)
    fn_input, fn_output = args

    def get_wordnet():
        return LinerWordnet(options.wordnet_path, options.liner_jar,
                            options.liner_lib)

    feature_generator = None
    if options.feature.startswith("hypernym"):
        wordnet = get_wordnet()
        feat = wordnet.get_hypernym_feature(options.feature,
                                            int(options.feature.split("-")[1]))

        def feature_generator(tok):
            return unicode(feat.generate(lemstrings_of_token(tok)))

    if options.feature.startswith("synonym"):
        wordnet = get_wordnet()
        feat = wordnet.get_synonym_feature()

        def feature_generator(tok):
            return unicode(feat.generate(lemstrings_of_token(tok)))

    if options.feature == 'ctag':

        def feature_generator(tok):
            return tagstrings_of_token(tok, tagset)

    if options.feature == 'orth':

        def feature_generator(tok):
            return orth_of_token(tok)

    if options.feature == 'lemma':

        def feature_generator(tok):
            return lemstrings_of_token(tok, tagset)

    if options.feature == 'lemma.ctag':

        def feature_generator(tok):
            return lemstrings_of_token(
                tok, tagset) + '.' + tagstrings_of_token(tok, tagset)

    if options.feature == 'lemma.class':

        def feature_generator(tok):
            return lemstrings_of_token(tok, tagset) + '.' + class_of_token(
                tok, tagset)

    if options.feature == 'class':

        def feature_generator(tok):
            return class_of_token(tok, tagset)

    if not feature_generator:
        print 'Unknown feature'
        print
        sys.exit(1)

    limit = options.limit
    print("token limit: " + str(limit))

    file_encoding = None
    if options.output_format == 'text':
        file_encoding = 'utf-8'

    token_count = 0

    with codecs.open(fn_output, 'wb', file_encoding,
                     buffering=16777216) as out:
        tagset = corpus2.get_named_tagset(options.tagset)
        rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                     tagset, fn_input)

        sentfirst = True  # if non-empty sent sep, skip pre-spaces
        limit_reached = False
        while not limit_reached:
            sent = rdr.get_next_sentence()
            if not sent:
                break

            for tok in sent.tokens():
                if limit and token_count >= limit:
                    limit_reached = True
                    break

                token_count += 1
                feat_val = feature_generator(tok)

                if (sentfirst and options.ignore_ns_sent
                    ) or tok.after_space() or options.separate_tokens:
                    out.write(' ')
                out.write(feat_val)
                sentfirst = False

            out.write(options.sent_sep)

    print('token count: ' + str(token_count))
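The feature helpers (orth_of_token, lemstrings_of_token, tagstrings_of_token, class_of_token) and LinerWordnet are external to this snippet. Below is a guess at the three simplest ones, based only on token and lexeme calls used elsewhere in these examples (orth(), get_preferred_lexeme(), lemma_utf8(), tag_to_string()); the real helpers may aggregate over all disamb lexemes instead of taking the preferred one.

def orth_of_token(tok):
    return unicode(tok.orth())


def lemstrings_of_token(tok, tagset):
    # preferred lexeme's lemma as a unicode string
    return tok.get_preferred_lexeme(tagset).lemma_utf8().decode('utf-8')


def tagstrings_of_token(tok, tagset):
    # preferred lexeme's tag rendered as a string
    return tagset.tag_to_string(tok.get_preferred_lexeme(tagset).tag())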
Example #18
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i',
                      '--input-format',
                      type='string',
                      action='store',
                      dest='input_format',
                      default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-o',
                      '--output-format',
                      type='string',
                      action='store',
                      dest='output_format',
                      default='xces',
                      help='set the output format; default: xces')
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    parser.add_option('-q',
                      '--quiet',
                      action='store_false',
                      default=True,
                      dest='verbose')
    parser.add_option('-s',
                      '--ignore-spaces',
                      action='store_false',
                      default=True,
                      dest='respect_spaces',
                      help='ignore spaces between tokens when comparing')
    parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        print 'You need to provide REF, REANA and OUTPUT files.'
        print 'See --help for details.'
        print
        sys.exit(1)

    ref_fn, rea_fn, out_fn = args

    num_ref_toks = 0
    num_segchange_toks = 0
    num_handled_toks = 0
    num_igned_toks = 0

    tagset = corpus2.get_named_tagset(options.tagset)

    ref_rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                     tagset, ref_fn)
    rea_rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                     tagset, rea_fn)

    writer = corpus2.TokenWriter.create_path_writer(options.output_format,
                                                    out_fn, tagset)

    stats = Stats(tagset, options.respect_spaces, options.verbose,
                  options.debug_mode)

    for sent in synced_sents(ref_rdr, rea_rdr, stats):
        writer.write_sentence(sent)

    if options.verbose:
        stats.dump()
Example #19
    def __init__(self, path):
        self.tagset = corpus2.get_named_tagset('nkjp')
        self.input_format = 'ccl'
        self.writer = corpus2.TokenWriter.create_path_writer(self.input_format, path, self.tagset)
def go():
	parser = OptionParser(usage=descr)
	parser.add_option('-t', '--tagset', type='string', action='store',
		dest='tagset', default='kipi',
		help='set the tagset used in input; default: kipi')
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='xces',
		help='set the input format; default: xces')
	parser.add_option('-o', '--output-format', type='string', action='store',
		dest='output_format', default='xces',
		help='set the output format; default: xces')
	parser.add_option('-I', '--input-file', type='string', action='store',
		dest='in_path', default='',
		help='set input filename (do not read from stdin)')
	parser.add_option('-O', '--output-file', type='string', action='store',
		dest='out_path', default='',
		help='set output filename (do not write to stdout)')
	parser.add_option('-C', '--chunks', action='store_true',
		dest='preserve_chunks', default=False,
		help='preserve input paragraph chunks (the default is to read sentences only)')
	parser.add_option('-D', '--lex-dir', type='string', action='store',
		dest='lex_dir', default='.',
		help='use the given directory to look for lexicon files (default: .)')
	parser.add_option('-A', '--ann-info', action='store_true',
		dest='ann_info', default=False,
		help='print annotation info')
	
	(options, args) = parser.parse_args()
	
	if len(args) != 1:
		sys.stderr.write('You need to provide a WCCL file.\n')
		sys.stderr.write('See %s --help\n' % sys.argv[0])
		sys.exit(1)
	
	wccl_path = args[0]
	
	# create a tagset object for all subsequent processing
	# when reading input corpus, when creating new tags/tokens and when
	# creating a WCCL parser it must be specified what tagset is being used
	tagset = corpus2.get_named_tagset(options.tagset)
	
	# now instantiate a WCCL parser and parse the provided WCCL file
	# lex_dir is optional -- if the WCCL code references external
	# lexicons, they will be sought in the given search path
	# (defaults to '.')
	p = wccl.Parser(tagset)
	wccl_file = p.parseWcclFileFromPath(wccl_path, options.lex_dir)
	# check if there are any rules in the parsed WCCL file
	if not wccl_file.has_tag_rules() and not wccl_file.has_match_rules():
		sys.stderr.write('The given WCCL file contains no rules.\n')
		sys.exit(1)
	
	# create a corpus2 reader using the given input format specs
	# (e.g. xces,flat), tagset name
	# if options.in_path is empty, will create an stdin reader
	reader = get_reader(options.in_path, tagset, options.input_format)
	# create a corpus2 writer -- will be used to write the processed corpus
	writer = get_writer(options.out_path, tagset, options.output_format)
	
	# processing paragraph-by-paragraph
	if options.preserve_chunks:
		while True:
			chunk = reader.get_next_chunk()
			if not chunk:
				break # end of input
			# process each sentence separately
			for sent in chunk.sentences():
				# wrap the sentence as an AnnotatedSentence
				asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
				process_sent(asent, wccl_file, options.ann_info)
			# save processed chunk
			# NOTE: if the input sent was not AnnotatedSentence, the changes
			# will be discarded
			writer.write_chunk(chunk)
	else:
		while True:
			sent = reader.get_next_sentence()
			if not sent:
				break # end of input
			# wrap the sentence as an AnnotatedSentence
			asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
			process_sent(asent, wccl_file, options.ann_info)
			# save processed sentence (safe)
			# NOTE: if the input sent was not AnnotatedSentence, the changes
			# will be discarded
			writer.write_sentence(sent)
Example #21
def process_file(wccl_rules, file_path, annotations):
    tagset = "nkjp"
    ctagset = corpus2.get_named_tagset(tagset)
    channels = ['t3_range', 't3_date', 't3_time', 't3_set', 't3_duration']

    p = wccl.Parser(ctagset)
    wc = p.parseWcclFile(wccl_rules)

    required = set()
    display = set()
    for (name, val) in annotations.items():
        if val['required'] == True:
            required.add(name)
        display.add(name)

    items = []
    tok_reader = corpus2.TokenReader.create_path_reader(
        'ccl', ctagset, file_path)
    while True:
        sent = tok_reader.get_next_sentence()
        if sent:
            item = ""
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            match_rules = wc.get_match_rules_ptr()
            match_rules.apply_all(asent)

            #
            # Auxiliary annotations
            #
            #aux_channels = []
            #for channel in asent.all_channels():
            #	if channel.startswith("aux_"):
            #		aux_channels.append(channel)

            #if len(aux_channels)>0:
            if contains_all(asent.all_channels(), required):
                aux_start = {}
                aux_end = set()

                for channel_name in asent.all_channels():
                    if channel_name in display:
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            # Starting tags
                            start_index = ann.indices[0]
                            if start_index not in aux_start:
                                aux_start[start_index] = []
                            aux_start[start_index].append(
                                (channel_name,
                                 ann.indices[-1] - ann.indices[0]))
                            # Ending tags
                            aux_end.add("%s:%d" %
                                        (channel_name, ann.indices[-1]))

                if len(aux_start) > 0:
                    item += "<div class='aux'>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        if i in aux_start:
                            sorted_x = reversed(
                                sorted(aux_start[i],
                                       key=operator.itemgetter(1)))
                            for (channel_name, length) in sorted_x:
                                color = "black" if channel_name not in annotations else annotations[
                                    channel_name]['color']
                                item += "<span class='%s aux' title='%s' style='border-color: %s'>" % (
                                    channel_name, channel_name, color)
                        # text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(
                            lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in display:
                            if channel_name + ":" + str(i) in aux_end:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"

            if len(item) > 0:
                items.append(item)

        else:
            break
    del tok_reader

    args = {}
    args["items"] = items
    return args
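contains_all() and lexem_to_title() are not included in the snippet above. Judging by its call site, contains_all is a simple subset check, sketched below; lexem_to_title presumably renders the lexeme's lemma and tag as a tooltip string and is not reconstructed here.

def contains_all(channels, required):
    # True if every required channel name occurs among the sentence's channels
    present = set(channels)
    return all(name in present for name in required)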
# -*- coding: utf-8 -*-
"""
Script annotating relation contexts between given lexical unit pairs.
Usage:
    extract_sentences.py <pairs> <src-dir> <out-dir>
"""
import corpus2
import os
from multiprocessing import Process, Queue
import itertools
import codecs
from docopt import docopt

NUM_THREADS = 6
MAX_CONTEXT_LEN = 10
tagset = corpus2.get_named_tagset('nkjp')
pairs = set()
output_dir = ''


class Consumer(Process):
    def __init__(self, task_queue):
        Process.__init__(self)
        self.task_queue = task_queue

    def run(self):
        while True:
            path = self.task_queue.get()
            if path is None:
                # Poison pill means we should exit
                break
Example #23
    def test_multiple_sentences_path_writer(self):
        path = self._save_temp_file('multiple_sentences_path_writer.ccl',
                                    documents.many_sentences_ccl)
        self.tagset = corpus2.get_named_tagset('nkjp')

        reader = corpus2mwe.CclMWEReader(path, self.tagset)
        reader.use_annotations(False)
        doc = reader.read()
        writer = corpus2.TokenWriter.create_path_writer(
            'ccl:gz', path + '.gz', self.tagset)
        for chunk in doc.paragraphs():
            writer.write_chunk(chunk)
        del writer

        readerGZ = corpus2.TokenReader.create_path_reader(
            'ccl:gz', self.tagset, path + '.gz')
        self.assertEqual(u'Szlachetnie urodzona',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'żelazna dziewica',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'napchała się',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'lanymi kluskami',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'.',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'Świeżo upieczona',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'juniorka młodsza',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'spotkała',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'płetwala karłowatego',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'i',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'razem',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'z',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'nim',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u',',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'po',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'wiejsku',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u',',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'nacieszyła się',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'zespołem',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'Rittera',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'.',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'Preimplantacyjna diagnostyka',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'jest',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'luksusowym dobrem',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'w',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'socjalistycznym realizmie',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
        self.assertEqual(u'.',
                         readerGZ.get_next_token().orth_utf8().decode('utf8'))
Example #24
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-t',
                      '--tagset',
                      type='string',
                      action='store',
                      dest='tagset',
                      default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    parser.add_option(
        '-s',
        '--stat',
        action='store_true',
        dest='stat_mode',
        help=
        'output P,R,f with no text labels, order like in normal mode: \n Chunks or heads \n Chunks and heads \n Chunks match \n Heads match'
    )
    (options, args) = parser.parse_args()

    stat_mode = options.stat_mode

    if len(args) != 3:
        sys.stderr.write('No args. See --help\n')
        sys.exit(1)

    batch_ref, batch_target, rel_name = args
    rel_stats = RelStats()

    corpus_type = "document"
    tagset = corpus2.get_named_tagset(options.tagset)

    ref_count = 0
    target_count = 0

    ref_file = open(batch_ref, "r")
    target_file = open(batch_target, "r")
    line_ref = ref_file.readline()
    line_target = target_file.readline()
    while line_ref and line_target:

        line_ref = line_ref.strip()
        ref_ccl_filename, ref_rel_filename = line_ref.split(";")

        line_target = line_target.strip()
        target_ccl_filename, target_rel_filename = line_target.split(";")

        ref_ccl_rdr = corpus2.CclRelReader(tagset, ref_ccl_filename,
                                           ref_rel_filename)
        target_ccl_rdr = corpus2.CclRelReader(tagset, target_ccl_filename,
                                              target_rel_filename)

        ref_doc = ref_ccl_rdr.read()
        target_doc = target_ccl_rdr.read()

        ref_rels = list(r for r in ref_doc.relations()
                        if r.rel_name() == rel_name)
        target_rels = list(t for t in target_doc.relations()
                           if t.rel_name() == rel_name)
        ref_count += len(ref_rels)
        target_count += len(target_rels)

        ref_sents = dict([(s.id(), corpus2.AnnotatedSentence.wrap_sentence(s))
                          for c in ref_doc.paragraphs()
                          for s in c.sentences()])
        target_sents = dict([
            (s.id(), corpus2.AnnotatedSentence.wrap_sentence(s))
            for c in target_doc.paragraphs() for s in c.sentences()
        ])

        for pattern in ref_rels:
            t = filter(lambda x: (compare(x, pattern) == 0), target_rels)
            if len(t) > 0:
                t = t[0]
                r = pattern

                both, chun, head = 0, 0, 0
                for dir_point_ref, dir_point_target in zip(
                    [r.rel_from(), r.rel_to()],
                    [t.rel_from(), t.rel_to()]):
                    ref_ann_sent = ref_sents[dir_point_ref.sentence_id()]
                    target_ann_sent = target_sents[
                        dir_point_target.sentence_id()]
                    b, c, h = rel_stats.verify_relation(
                        ref_ann_sent, dir_point_ref, target_ann_sent,
                        dir_point_target)
                    both, chun, head = map(sum,
                                           zip([b, c, h], [both, chun, head]))
                rel_stats.update_stats(both, chun, head)
        line_ref = ref_file.readline()
        line_target = target_file.readline()

    rel_stats.print_stats(ref_count, target_count, stat_mode)
Example #25
def go():
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='xces',
		help='set the input format; default: xces')
	parser.add_option('-t', '--tagset', type='string', action='store',
		dest='tagset', default='nkjp',
		help='set the tagset used in input; default: nkjp')
	parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose')
	parser.add_option('-u', '--unk-tag', type='string', action='store',
		dest='unk_tag', default='ign',
		help='set the tag used for unknown forms; default: ign')
	parser.add_option('-k', '--keep-optional', action='store_false',
		default=True, dest='expand_optional',
		help='do not expand unspecified optional attributes to multiple tags')
	parser.add_option('-s', '--ignore-spaces', action='store_false',
		default=True, dest='respect_spaces',
		help='ignore spaces between tokens when comparing')
	parser.add_option('-f', '--first-lexeme-only', action='store_true',
		default=False, dest='first_lex_only',
		help='read only each token\'s first disamb lexeme (tag+lemma)')
	parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
	(options, args) = parser.parse_args()
	
	if len(args) < 2 or len(args) % 2 != 0:
		print 'You need to provide a series of tagged folds and a corresponding'
		print 'series of reference folds.'
		print 'See --help for details.'
		print
		sys.exit(1)
	
	tagset = corpus2.get_named_tagset(options.tagset)
	
	num_folds = len(args) / 2
	
	weak_lem_lower_bound = 0.0
	kn_strong_lem_lower_bound = 0.0
	unk_strong_lem_lower_bound = 0.0
	strong_lem_lower_bound = 0.0
	strong_lem_nocase_lower_bound = 0.0
	strong_lem_case_cat_heur = 0.0
	strong_lem_nocase_cat_heur = 0.0
	
	weak_lower_bound = 0.0
	weak_upper_bound = 0.0
	
	strong_pos_lower = 0.0

	unk_weak_lower = 0.0
	unk_weak_upper = 0.0
	kn_weak_lower = 0.0
	kn_weak_upper = 0.0
	
	perc_unk = 0.0
	perc_segchange = 0.0
	
	for fold_idx in range(num_folds):
		tag_fn = args[fold_idx] # filename of tagged fold @ fold_idx
		ref_fn = args[fold_idx + num_folds] # ... reference fold @ fold_idx
		if options.verbose:
			print '### FOLD %2d: %s (tag) v. %s (ref)' % ((fold_idx + 1), tag_fn, ref_fn)
		tag_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, tag_fn)
		ref_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, ref_fn)
		
		res = TokComp(
			tagset, options.unk_tag, options.expand_optional,
			options.first_lex_only, options.debug_mode)
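		# walk the tagged and reference readers in parallel, accumulating
		# per-fold token comparison statistics in TokComp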
		for tag_seq, ref_seq in tok_seqs(tag_rdr, ref_rdr, options.respect_spaces, options.verbose, options.debug_mode):
			res.update(tag_seq, ref_seq)

		print "PolEval 2017 competition scores"
		print "-------------------------------"
		print 'POS accuracy (Subtask A score): \t%.4f%%' % res.value_of(Metric.SC_LOWER)
		print 'POS accuracy (known words): \t%.4f%%' % res.value_of(Metric.KN_SC_LOWER)
		print 'POS accuracy (unknown words): \t%.4f%%' % res.value_of(Metric.UNK_SC_LOWER)
		print 'Lemmatization accuracy (Subtask B score): \t%.4f%%' % res.value_of(Metric.SL_LOWER)
		print 'Lemmatization accuracy (known words): \t%.4f%%' % res.value_of(Metric.KN_SL_LOWER)
		print 'Lemmatization accuracy (unknown words): \t%.4f%%' % res.value_of(Metric.UNK_SL_LOWER)
		print 'Overall accuracy (Subtask C score): \t%.4f%%' % ((res.value_of(Metric.SC_LOWER) + res.value_of(Metric.SL_LOWER)) / 2)


		if options.verbose:
			res.dump()
		
		weak_lem_lower_bound += res.value_of(Metric.WL_LOWER)
		kn_strong_lem_lower_bound += res.value_of(Metric.KN_SL_LOWER)
		unk_strong_lem_lower_bound += res.value_of(Metric.UNK_SL_LOWER)
		strong_lem_lower_bound += res.value_of(Metric.SL_LOWER)
		
		strong_lem_nocase_lower_bound += res.value_of(Metric.SL_NOCASE_LOWER)
		
		strong_lem_case_cat_heur += res.value_of(Metric.SL_CASE_CAT_HEUR)
		strong_lem_nocase_cat_heur += res.value_of(Metric.SL_NOCASE_CAT_HEUR)
		
		weak_lower_bound += res.value_of(Metric.WC_LOWER)
		weak_upper_bound += res.value_of(Metric.WC_LOWER) + res.value_of(Metric.SEG_CHANGE)
		unk_weak_lower += res.value_of(Metric.UNK_WC_LOWER)
		unk_weak_upper += res.value_of(Metric.UNK_WC_LOWER) + res.value_of(Metric.UNK_SEG_CHANGE)
		kn_weak_lower += res.value_of(Metric.KN_WC_LOWER)
		kn_weak_upper += res.value_of(Metric.KN_WC_LOWER) + res.value_of(Metric.KN_SEG_CHANGE)
		strong_pos_lower += res.value_of(Metric.POS_SC_LOWER)
		perc_unk += res.value_of(Metric.UNK)
		perc_segchange += res.value_of(Metric.SEG_CHANGE)
	
	# weak lemma -- the set of lemmas output by the tagger and the set in the reference corpus intersect
	print 'AVG weak lemma lower bound\t%.4f%%' % (weak_lem_lower_bound / num_folds)
	print 'AVG KN strong lemma lower bound\t%.4f%%' % (kn_strong_lem_lower_bound / num_folds)
	print 'AVG UNK strong lemma lower bound\t%.4f%%' % (unk_strong_lem_lower_bound / num_folds)
	# strong lemma -- the tagger's lemma set and the reference set are equal
	print 'AVG strong lemma lower bound\t%.4f%%' % (strong_lem_lower_bound / num_folds)
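	# hypothetical illustration: if the tagger outputs lemma set {'zamek'} and
	# the reference has {'zamek', 'zamkowy'}, the sets intersect but are not
	# equal, so the token is a weak lemma hit but not a strong one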
	print 'AVG strong lemma nocase lower bound\t%.4f%%' % (strong_lem_nocase_lower_bound / num_folds)
	
	print 'AVG strong lemma case concat heur\t%.4f%%' % (strong_lem_case_cat_heur / num_folds)
	print 'AVG strong lemma nocase concat heur\t%.4f%%' % (strong_lem_nocase_cat_heur / num_folds)
	
	print 'AVG weak corr lower bound\t%.4f%%' % (weak_lower_bound / num_folds)
	print 'AVG weak corr upper bound\t%.4f%%' % (weak_upper_bound / num_folds)
	
	print 'AVG UNK weak corr lower bound\t%.4f%%' % (unk_weak_lower / num_folds)
	print 'AVG UNK weak corr upper bound\t%.4f%%' % (unk_weak_upper / num_folds)
	print 'AVG KN  weak corr lower bound\t%.4f%%' % (kn_weak_lower / num_folds)
	print 'AVG KN  weak corr upper bound\t%.4f%%' % (kn_weak_upper / num_folds)
	print 'AVG POS strong corr lower bound\t%.4f%%' % (strong_pos_lower / num_folds)
	
	print 'AVG percentage UNK\t%.4f%%' % (perc_unk / num_folds)
	print 'AVG percentage seg change\t%.4f%%' % (perc_segchange / num_folds)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import wccl
import corpus2
TAGSET = 'nkjp'
TAGSET_OBJECT = corpus2.get_named_tagset(TAGSET)


# Helper methods that may be unused for now, but it took me a while to work out
# how to do this, so I'm leaving them in - they might come in handy.
def _get_token_all_classes(token):
    tag = token.get_preferred_lexeme(TAGSET_OBJECT).tag()
    return TAGSET_OBJECT.tag_to_symbol_string(tag).split(',')

def _get_token_classes(token, name):
    tag = token.get_preferred_lexeme(TAGSET_OBJECT).tag()
    mask = corpus2.get_attribute_mask(TAGSET_OBJECT, name)
    return TAGSET_OBJECT.tag_to_symbol_string(tag.get_masked(mask)).split(',')

def _check_token_belong_to_all(token, names):
    values = _get_token_all_classes(token)
    for name in names:
        if name not in values:
            return False
    return True



def _check_token_belong_to_any(token, names):
    values = _get_token_all_classes(token)
    # the source snippet breaks off below; body completed to mirror
    # _check_token_belong_to_all, returning True on the first matching symbol
    for value in values:
        if value in names:
            return True
    return False
示例#27
0
def go():
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='xces',
		help='set the input format; default: xces')
	parser.add_option('-o', '--output-format', type='string', action='store',
		dest='output_format', default='xces',
		help='set the output format; default: xces')
	parser.add_option('-t', '--tagset', type='string', action='store',
		dest='tagset', default='nkjp',
		help='set the tagset used in input; default: nkjp')
	parser.add_option('-f', '--num-folds', type='int', action='store',
		dest='num_folds', default='10',
		help='set the number of folds (default: 10)')
	parser.add_option('-s', '--seed-word', type='string', action='store',
		dest='seedword', default='korpus',
		help='set the seedword; default: korpus')
	parser.add_option('-v', '--verbose', action='store_true',
		dest='verbose', default=False,
		help='verbose mode')
	
	(options, args) = parser.parse_args()
	if len(args) != 2:
		print 'Need to provide input file and output dir.'
		print 'See --help for details.'
		print
		sys.exit(1)
	
	fold_nums = range(options.num_folds)
	fn_input, fold_dir = args
	
	tagset = corpus2.get_named_tagset(options.tagset)
	# count paragraphs in input
	if options.verbose:
		sys.stderr.write('Counting paragraphs... ')
	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
	num_pars = 0
	while True:
		par = rdr.get_next_chunk()
		if not par:
			break
		num_pars += 1
	del rdr
	if options.verbose:
		sys.stderr.write('%d\n' % num_pars)
	# prepare index -- where to send ith paragraph
	rnd = random.Random(options.seedword)
	fold_of_par = [(par_idx % options.num_folds) for par_idx in xrange(num_pars)]
	rnd.shuffle(fold_of_par)
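	# seeding the shuffle with the seedword keeps the paragraph-to-fold
	# assignment reproducible across runs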
	# now the real run
	if options.verbose:
		sys.stderr.write('Generating folds...\n')
	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
	fold_test = [corpus2.TokenWriter.create_path_writer(
			options.output_format,
			os.path.join(fold_dir, 'test%02d.xml' % (num + 1)), tagset)
			for num in fold_nums]
	fold_train = [corpus2.TokenWriter.create_path_writer(
			options.output_format,
			os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset)
			for num in fold_nums]
	first = True
	par_now = 0
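	# each paragraph is written to its assigned test fold and to the train
	# file of every other fold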
	while True:
		par = rdr.get_next_chunk()
		if not par:
			break
		fold_now = fold_of_par[par_now]
		fold_test[fold_now].write_chunk(par)
		for other_num in fold_nums:
			if other_num != fold_now:
				fold_train[other_num].write_chunk(par)
		
		#fold_now = (fold_now + 1) % options.num_folds
		par_now += 1
	del rdr
	for w in fold_test: w.finish()
	for w in fold_train: w.finish()
示例#28
0
def process_files(start, offset, wccl_rules, corpus_path):
    tagset = "nkjp"
    ctagset = corpus2.get_named_tagset(tagset)
    root = os.path.dirname(corpus_path) + "/"
    list_path = corpus_path
    channels = ['t3_range', 't3_date', 't3_time', 't3_set', 't3_duration']

    p = wccl.Parser(ctagset)
    wc = p.parseWcclFile(wccl_rules)

    processed = 0
    docs = {}
    filenames = codecs.open(list_path, "r", "utf-8").readlines()
    docs_filenames = []
    for line in filenames[start:start + offset]:
        processed += 1
        line = line.strip()
        xml_file = (root + line).encode("utf-8")
        if os.path.isfile(xml_file):
            docs_filenames.append(line)
            docs[line] = corpus2.TokenReader.create_path_reader(
                'ccl', ctagset, xml_file)
        else:
            #print "ERROR: File not found " + xml_file
            pass

    items = []
    for filename in docs_filenames:
        tok_reader = docs[filename]
        while True:
            sent = tok_reader.get_next_sentence()
            if sent:
                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
                match_rules = wc.get_match_rules_ptr()

                ans_ref = set()
                ans_cmp = set()
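                # record the reference annotations, then clear each channel so
                # the WCCL match rules can re-annotate the sentence from scratch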

                for channel_name in channels:
                    if asent.has_channel(channel_name):
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            ans_ref.add("%s:%d:%d" %
                                        (channel_name, ann.indices[0],
                                         ann.indices[-1]))
                        ch = asent.get_channel(channel_name)
                        for i in range(0, len(asent.tokens())):
                            ch.set_segment_at(i, 0)

                match_rules.apply_all(asent)

                for channel_name in channels:
                    if asent.has_channel(channel_name):
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            ans_cmp.add("%s:%d:%d" %
                                        (channel_name, ann.indices[0],
                                         ann.indices[-1]))

                ends = set()
                tp = set()
                fp = set()
                fn = set()
                fn_ends = set()
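                # classify each predicted span (channel:start:end) as TP or FP
                # against the reference spans; unmatched reference spans are FN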

                for an in ans_cmp:
                    (chan, start, end) = an.split(":")
                    ends.add(chan + ":" + end)
                    if an in ans_ref:
                        tp.add(chan + ":" + start)
                    else:
                        fp.add(chan + ":" + start)

                for an in ans_ref:
                    (chan, start, end) = an.split(":")
                    if an not in ans_cmp:
                        fn.add(chan + ":" + start)
                        fn_ends.add(chan + ":" + end)

                item = ""

                #
                # Auxiliary annotations
                #
                aux_channels = []
                for channel in asent.all_channels():
                    if channel.startswith("aux_"):
                        aux_channels.append(channel)

                if len(aux_channels) > 0:
                    aux_start = set()
                    aux_end = set()

                    for channel_name in aux_channels:
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            aux_start.add("%s:%d" %
                                          (channel_name, ann.indices[0]))
                            aux_end.add("%s:%d" %
                                        (channel_name, ann.indices[-1]))

                    item += "<div class='aux'><div>Auxiliary annotations:</div>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        for channel_name in aux_channels:
                            key = "%s:%d" % (channel_name, i)
                            if key in aux_start:
                                item += "<span class='%s' title='%s'>" % (
                                    channel_name, channel_name)
                        # token text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(
                            lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in aux_channels:
                            if channel_name + ":" + str(i) in aux_end:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"

                #
                # Recognized annotations
                #
                if len(ans_cmp) > 0 or len(ans_ref) > 0:
                    item += "<div class='recognized'><div>Recognized annotations:</div>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        for channel_name in channels:
                            key = "%s:%d" % (channel_name, i)
                            if key in tp:
                                item += "<span class='%s tp' title='%s'>" % (
                                    channel_name, channel_name)
                            if key in fp:
                                item += "<span class='%s fp' title='%s'>" % (
                                    channel_name, channel_name)
                        # token text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(
                            lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in channels:
                            if channel_name + ":" + str(i) in ends:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"

                #
                # Missing annotations
                #
                if len(fn) > 0:
                    item += "<div class='reference'><div>Missing annotations:</div>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        for channel_name in channels:
                            key = "%s:%d" % (channel_name, i)
                            if key in fn:
                                item += "<span class='%s fn' title='%s'>" % (
                                    channel_name, channel_name)
                        # token text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(
                            lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in channels:
                            if channel_name + ":" + str(i) in fn_ends:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"

                if len(item) > 0:
                    doc_id = os.path.basename(
                        filename.encode("utf-8")).split(".")[0]
                    href = '<a href="http://www.nlp.pwr.wroc.pl/inforex?page=report&id=%s" target="_blank">%s</a>' % (
                        doc_id, filename.encode("utf-8"))
                    items.append("<b>" + href + "</b>" + item)

            else:
                break
        del tok_reader

    args = {}
    args["processed"] = processed
    args["items"] = items
    args["total"] = len(filenames)
    return args
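

if __name__ == '__main__':
    # minimal usage sketch -- the paths below are placeholders, not taken from
    # the original script; process_files returns a dict with the keys
    # "processed", "items" and "total" as built above
    stats = process_files(0, 100, 'rules.ccl', 'corpus/filelist.txt')
    print "%d of %d documents processed" % (stats["processed"], stats["total"])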