def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    (options, args) = parser.parse_args()
    ts = corpus2.get_named_tagset(options.tagset)
    p = wccl.Parser(ts)
    ops = []  # (name, op) pairs
    infiles = []
    for arg in args:
        if arg.endswith('.xml'):
            infiles.append(arg)
        elif arg.endswith('.ccl'):
            f = p.parseWcclFileFromPath(arg)
            ops.extend(f.gen_all_op_pairs())
        else:
            # parse arg as a single operator string
            op = p.parseAnyOperator(arg)
            ops.append((arg, op))
    if ops and infiles:
        for fname in infiles:
            rdr = corpus2.TokenReader.create_path_reader(
                options.input_format, ts, fname)
            for chunk in chunks(rdr):
                # dump op names
                print '\t'.join(name for (name, _) in ops)
                # iterate and dump values
                for sent in chunk.sentences():
                    for con in iter_sent(sent):
                        print '\t'.join(op.base_apply(con).to_string(ts)
                                        for (_, op) in ops)
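# The go() above iterates with a chunks() helper that is not shown in this
# snippet. A minimal sketch of such a helper, assuming only the
# rdr.get_next_chunk() call used elsewhere in these scripts (the name and
# placement are an assumption, not the original code):
def chunks(rdr):
    """Yield consecutive paragraphs ("chunks") until the reader is exhausted."""
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            break
        yield chunk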
def merge(source_paths, output_path=None, input_format='ccl',
          output_format='ccl', tagset='nkjp', chunks=False,
          prefix_chunks=False, prefix_sentences=False,
          documents_as_chunks=False):
    # load a tagset, create a reader
    if isinstance(tagset, str):
        tagset = corpus2.get_named_tagset(tagset)
    if output_path:
        writer = corpus2.TokenWriter.create_path_writer(
            output_format, output_path, tagset)
    else:
        writer = corpus2.TokenWriter.create_stdout_writer(
            output_format, tagset)
    for path in source_paths:
        reader = corpus2.TokenReader.create_path_reader(
            input_format, tagset, path)
        fname, _ = os.path.splitext(os.path.basename(path))
        fname = escape(fname)
        if chunks:
            # `chunks` is a boolean flag here, so iterate over paragraphs
            # directly via the reader
            chunk_no = 1
            while True:
                chunk = reader.get_next_chunk()
                if not chunk:
                    break
                if prefix_chunks:
                    if chunk.has_attribute('id'):
                        their_id = chunk.get_attribute('id')
                    else:
                        # autogen
                        their_id = ('auto%03d' % chunk_no)
                    full_id = 'file:%s:%s' % (fname, their_id)
                    chunk.set_attribute('id', full_id)
                writer.write_chunk(chunk)
                chunk_no += 1
        else:
            big_chunk = None
            if documents_as_chunks:
                big_chunk = corpus2.Chunk()
                big_chunk.set_attribute('id', 'file:%s:%s' % (fname, 'ch1'))
            sent_no = 1
            for sent in sentences(reader):
                if prefix_sentences:
                    if sent.id():
                        their_id = sent.id()
                    else:
                        # autogen
                        their_id = ('s%d' % sent_no)
                    full_id = 'file:%s:%s' % (fname, their_id)
                    sent.set_id(full_id)
                if big_chunk:
                    big_chunk.append(sent)
                else:
                    writer.write_sentence(sent)
                sent_no += 1
            if big_chunk:
                writer.write_chunk(big_chunk)
        del reader
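# A hypothetical invocation of merge() above; the file names are made up and
# only illustrate the expected arguments (a list of source paths, an optional
# output path and the keyword switches):
#
#   merge(['doc01.ccl', 'doc02.ccl'], output_path='merged.ccl',
#         input_format='ccl', output_format='ccl', tagset='nkjp',
#         prefix_sentences=True, documents_as_chunks=True)
#
# merge() also relies on a sentences() helper that is not part of this
# snippet; a minimal sketch, assuming only the reader API shown in these
# scripts:
def sentences(reader):
    """Yield consecutive sentences until the reader is exhausted."""
    while True:
        sent = reader.get_next_sentence()
        if not sent:
            break
        yield sent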
def __init__(self):
    self.__wccl_files = []
    tagset = corpus2.get_named_tagset(TAGSET)
    p = wccl.Parser(tagset)
    for f in get_wccl_files():
        self.__wccl_files.append(p.parseWcclFileFromPath(f, RULES_DIR))
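# A hedged sketch of how a parsed WCCL file (as loaded above) might be applied
# to a single sentence; wrap_sentence(), has_match_rules() and
# get_match_rules_ptr() follow their usage elsewhere in this file, while
# apply_match_rules() itself is a hypothetical helper, not part of the
# original class.
def apply_match_rules(wccl_file, sent):
    # wrap the plain sentence so that annotation channels can be written
    asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
    if wccl_file.has_match_rules():
        wccl_file.get_match_rules_ptr().apply_all(asent)
    return asent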
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-p', '--par-sep', type='string', action='store',
        dest='par_sep', default='\n\n',
        help='set the paragraph separator; default: (two newlines)')
    parser.add_option('--ignore-ns-sent', action='store_true',
        default=False, dest='ignore_ns_sent',
        help='ignore no-space markers on sent boundaries')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        print 'Need to provide input and output.'
        print 'See --help for details.'
        print
        sys.exit(1)
    fn_input, fn_output = args
    with codecs.open(fn_output, 'wb', 'utf-8') as out:
        tagset = corpus2.get_named_tagset(options.tagset)
        rdr = corpus2.TokenReader.create_path_reader(
            options.input_format, tagset, fn_input)
        first = True
        while True:
            par = rdr.get_next_chunk()
            parfirst = True
            if not par:
                break
            for sent in par.sentences():
                sentfirst = True  # if non-empty sent sep, skip pre-spaces
                for tok in sent.tokens():
                    if not parfirst and ((sentfirst and options.ignore_ns_sent)
                                         or tok.after_space()):
                        out.write(' ')
                    out.write(unicode(tok.orth()))
                    sentfirst = False
                    parfirst = False
            out.write(options.par_sep)
def __init__(self, path):
    super(CCLNERParser, self).__init__(path)
    self.tagset = corpus2.get_named_tagset('nkjp')
    self.input_format = 'ccl'
    print path
    self.reader = corpus2.TokenReader.create_path_reader(
        self.input_format, self.tagset, path)
    self.sent_struct = {}
    self.logger = logging.getLogger('CCLNERParser')
    self.parse()
def __init__(self, liner_ini, tagset='nkjp'):
    self.tagset = corpus2.get_named_tagset(tagset)
    ChunkerFactory = JClass("g419.liner2.api.chunker.factory.ChunkerFactory")
    self.options = JClass("g419.liner2.api.LinerOptions")()
    self.options.parseModelIni(liner_ini)
    #self.chunkerManager = ChunkerFactory.loadChunkers(self.options)
    if not self.options.features.isEmpty():
        self.featureGen = JClass(
            "g419.liner2.api.features.TokenFeatureGenerator")(self.options.features)
    else:
        self.featureGen = None
def get_morpho(options, corpname, outfname):
    tagset = corpus2.get_named_tagset(options.tagset)
    anal = Analyser(tagset, options.keep_case, bool(options.freq_list))
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, corpname)
    while True:
        tok = rdr.get_next_token()
        if not tok:
            break
        anal.consume(tok)
    del rdr
    anal.save(outfname)
    if options.freq_list:
        anal.save_freq(options.freq_list)
def test_one_sentence_path_writer(self):
    path = self._save_temp_file('one_sentence_path_writer.ccl',
                                documents.short_ccl)
    self.tagset = corpus2.get_named_tagset('nkjp')
    reader = corpus2mwe.CclMWEReader(path, self.tagset)
    reader.use_annotations(False)
    doc = reader.read()
    writer = corpus2.TokenWriter.create_path_writer(
        'ccl:gz', path + '.gz', self.tagset)
    for chunk in doc.paragraphs():
        writer.write_chunk(chunk)
    del writer
    readerGZ = corpus2.TokenReader.create_path_reader(
        'ccl:gz', self.tagset, path + '.gz')
    self.assertEqual(u'Szlachetnie urodzona',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'żelazna dziewica',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'napchała się',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'lanymi kluskami',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
def setUp(self):
    self.tagset = corpus2.get_named_tagset('nkjp')
    self.temp_dir = os.path.join(tempfile.gettempdir(), "corpus2_test")
    if os.path.exists(self.temp_dir):
        shutil.rmtree(self.temp_dir)
    os.mkdir(self.temp_dir)
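# A matching tearDown() sketch for the setUp() above (an assumption -- the
# original test class may or may not define one); it simply removes the
# temporary directory created in setUp().
def tearDown(self):
    if os.path.exists(self.temp_dir):
        shutil.rmtree(self.temp_dir)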
def main(ch_path, ref_path, chan_names, input_format, out_path, tagset,
         verbose, folds):
    chan_names = chan_names.split(",")
    chunkTable = CSVTable(";")
    chunkTable.addColumn('Nr')
    headTable = CSVTable(";")
    headTable.addColumn('Nr')
    bothTable = CSVTable(";")
    bothTable.addColumn('Nr')
    for chan_name in chan_names:
        chunkTable.addColumn(chan_name)
        chunkTable.addSubColumn(chan_name, "P", type="float")
        chunkTable.addSubColumn(chan_name, "R", type="float")
        chunkTable.addSubColumn(chan_name, "F", type="float")
        headTable.addColumn(chan_name)
        headTable.addSubColumn(chan_name, "P", type="float")
        headTable.addSubColumn(chan_name, "R", type="float")
        headTable.addSubColumn(chan_name, "F", type="float")
        bothTable.addColumn(chan_name)
        bothTable.addSubColumn(chan_name, "P", type="float")
        bothTable.addSubColumn(chan_name, "R", type="float")
        bothTable.addSubColumn(chan_name, "F", type="float")
    tagset = corpus2.get_named_tagset(tagset)
    for fold in range(1, folds + 1):
        if folds > 1:
            ch_path_fold = os.path.join(
                ch_path, 'ccl-test' + str(fold).zfill(2) + '.xml')
            ref_path_fold = os.path.join(
                ref_path, 'ccl-test' + str(fold).zfill(2) + '.xml')
        else:
            ch_path_fold = ch_path
            ref_path_fold = ref_path
        chunkResults = {}
        headResults = {}
        bothResults = {}
        for chan_name in chan_names:
            ch_rdr = corpus2.TokenReader.create_path_reader(
                input_format, tagset, ch_path_fold)
            ref_rdr = corpus2.TokenReader.create_path_reader(
                input_format, tagset, ref_path_fold)
            stats = Stats()
            while True:
                # iterate over paragraphs (note that they are called "chunks" here)
                ref_chunk = ref_rdr.get_next_chunk()
                ch_chunk = ch_rdr.get_next_chunk()
                assert (not ref_chunk) == (not ch_chunk), \
                    'corpora of different length'
                if not ref_chunk:
                    break  # end of input
                # process each sentence separately
                for ch_sent, ref_sent in zip(ch_chunk.sentences(),
                                             ref_chunk.sentences()):
                    assert ch_sent.size() == ref_sent.size()
                    ch_annots = get_annots(ch_sent, chan_name)
                    ref_annots = get_annots(ref_sent, chan_name)
                    stats.update(ch_annots, ref_annots)
            chunkResults[chan_name] = stats.getChunkStats()
            headResults[chan_name] = stats.getHeadStats()
            bothResults[chan_name] = stats.getBothStats()
        chunkTable.addRow(chunkResults)
        headTable.addRow(headResults)
        bothTable.addRow(bothResults)
    if folds > 1:
        chunkTable.countAvg()
        headTable.countAvg()
        bothTable.countAvg()
    if out_path != '':
        out = codecs.open(out_path, "w", "utf-8")
        out.write("Chunks--------------------------------------------------\n")
        out.write(chunkTable.__str__())
        out.write("Heads---------------------------------------------------\n")
        out.write(headTable.__str__())
        out.write("Both----------------------------------------------------\n")
        out.write(bothTable.__str__())
        out.close()
    else:
        print "Chunks--------------------------------------------------"
        print chunkTable
        print "Heads---------------------------------------------------"
        print headTable
        print "Both----------------------------------------------------"
        print bothTable
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    #parser.add_option('-o', '--output-format', type='string', action='store',
    #    dest='output_format', default='xces',
    #    help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='report each token')
    parser.add_option('-n', '--number-of-tags', type='int', action='store',
        dest='num_tags', default=10,
        help='set the max number of tags to report')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        print 'You need to provide an input corpus.'
        print 'See %s --help' % sys.argv[0]
        sys.exit(1)
    inpath = args[0]
    # load a tagset, create a reader
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, inpath)
    # init stats (for this example)
    num_toks, num_sents, num_chunks = 0, 0, 0
    tag_count = dd(int)
    for chunk in chunks(rdr):
        for sent in chunk.sentences():
            for tok in sent.tokens():
                if options.verbose:
                    print tok.orth_utf8()
                for lex in tok.lexemes():
                    tag_str = tagset.tag_to_string(lex.tag())
                    tag_count[tag_str] += 1
                    if options.verbose:
                        lemma = lex.lemma_utf8()
                        print ('+' if lex.is_disamb() else ' '), lemma, tag_str
                        # if you want a unicode object, orth_utf8().decode('utf-8')
                num_toks += 1
            num_sents += 1
        num_chunks += 1
    print 'Tokens:', num_toks
    print 'Sents: ', num_sents
    print 'Chunks:', num_chunks
    print
    print 'Most frequent tags:'
    for tc in sorted(tag_count.items(),
                     key=lambda tc: (-tc[1], tc[0]))[:options.num_tags]:
        print '\t%s\t%d' % tc
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='ccl',
        help='set the input format; default: ccl')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='ccl',
        help='set the output format; default: ccl')
    parser.add_option('-r', '--result', type='string', action='store',
        dest='result_path', default='',
        help='set filename for script output, output is appended to the file content; default: stdout')
    parser.add_option('-I', '--input-file', type='string', action='store',
        dest='in_path', default='',
        help='set input filename (do not read from stdin)')
    parser.add_option('-O', '--output-file', type='string', action='store',
        dest='out_path', default='',
        help='set the output filename for processed input file with wccl rules applied')
    parser.add_option('-C', '--chunks', action='store_true',
        dest='preserve_chunks', default=False,
        help='preserve input paragraph chunks (the default is to read sentences only)')
    parser.add_option('-D', '--lex-dir', type='string', action='store',
        dest='lex_dir', default='.',
        help='use the given directory to look for lexicon files (default: .)')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option('-q', '--qtype', type='string', action='store',
        dest='default_qtype', default='unknown',
        help='set the question type for unresolved instances; default: unknown')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        sys.stderr.write('You need to provide a WCCL file.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    wccl_path = args[0]
    # create a tagset object for all subsequent processing
    # when reading input corpus, when creating new tags/tokens and when
    # creating a WCCL parser it must be specified what tagset is being used
    tagset = corpus2.get_named_tagset(options.tagset)
    # now instantiate a WCCL parser and parse the provided WCCL file
    # lex_dir is optional -- if the WCCL code references external
    # lexicons, they will be sought in the given search path
    # (defaults to '.')
    p = wccl.Parser(tagset)
    wccl_file = p.parseWcclFileFromPath(wccl_path, options.lex_dir)
    # check if there are any rules in the parsed WCCL file
    if not wccl_file.has_tag_rules() and not wccl_file.has_match_rules():
        sys.stderr.write('The given WCCL file contains no rules.\n')
        sys.exit(1)
    # create a corpus2 reader using the given input format specs
    # (e.g. xces,flat) and tagset name
    # if options.in_path is empty, will create an stdin reader
    reader = get_reader(options.in_path, tagset, options.input_format)
    # create a corpus2 writer -- will be used to write the processed corpus
    writer = None
    if options.out_path != '':
        writer = get_writer(options.out_path, tagset, options.output_format)
    qtypes = list([(QTYPE1, defaultdict(default_int)),
                   (QTYPE2, defaultdict(default_int)),
                   (QTYPE3, defaultdict(default_int))])
    while True:
        chunk = reader.get_next_chunk()
        if not chunk:
            break  # end of input
        # process each sentence separately
        for sent in chunk.sentences():
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            process_sent(asent, wccl_file)
            for token in asent.tokens():
                metadata = token.get_metadata()
                if metadata:
                    for qtype_key, qtype_dict in qtypes:
                        if metadata.has_attribute(qtype_key):
                            attr = metadata.get_attribute(qtype_key)
                            qtype_dict[attr] = qtype_dict[attr] + 1
        # save processed object
        # NOTE: if the input sent was not AnnotatedSentence, the changes
        # will be discarded
        if writer:
            if options.preserve_chunks:
                writer.write_chunk(chunk)
            else:
                for sent in chunk.sentences():
                    writer.write_sentence(sent)
    max_label = options.default_qtype
    summary = defaultdict(default_int)
    for qtype_key, qtype_dict in qtypes:
        for key_word in qtype_dict.keys():
            summary[key_word] = summary[key_word] + qtype_dict[key_word]
        if not summary.values():
            continue
        max_value = max(summary.values())
        if summary.values().count(max_value) == 1:
            max_label = max(summary.keys(), key=lambda k: summary[k])
            break
    return_label(max_label, options)
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-I', '--input-file', type='string', action='store',
        dest='in_path', default='',
        help='set input filename (do not read from stdin)')
    parser.add_option('-O', '--output-file', type='string', action='store',
        dest='out_path', default='',
        help='set output filename (do not write to stdout)')
    parser.add_option('-C', '--chunks', action='store_true',
        dest='preserve_chunks', default=False,
        help='preserve input paragraph chunks (the default is to read sentences only)')
    parser.add_option('-D', '--lex-dir', type='string', action='store',
        dest='lex_dir', default='.',
        help='use the given directory to look for lexicon files (default: .)')
    parser.add_option('-A', '--ann-info', action='store_true',
        dest='ann_info', default=False,
        help='print annotation info')
    (options, args) = parser.parse_args()
    print args
    if len(args) != 1:
        sys.stderr.write('You need to provide a WCCL file.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    wccl_path = args[0]
    # create a tagset object for all subsequent processing
    # when reading input corpus, when creating new tags/tokens and when
    # creating a WCCL parser it must be specified what tagset is being used
    tagset = corpus2.get_named_tagset(options.tagset)
    # now instantiate a WCCL parser and parse the provided WCCL file
    # lex_dir is optional -- if the WCCL code references external
    # lexicons, they will be sought in the given search path
    # (defaults to '.')
    p = wccl.Parser(tagset)
    wccl_file = p.parseWcclFileFromPath(wccl_path, options.lex_dir)
    # check if there are any rules in the parsed WCCL file
    if not wccl_file.has_tag_rules() and not wccl_file.has_match_rules():
        sys.stderr.write('The given WCCL file contains no rules.\n')
        sys.exit(1)
    # create a corpus2 reader using the given input format specs
    # (e.g. xces,flat) and tagset name
    # if options.in_path is empty, will create an stdin reader
    reader = get_reader(options.in_path, tagset, options.input_format)
    # create a corpus2 writer -- will be used to write the processed corpus
    writer = get_writer(options.out_path, tagset, options.output_format)
    # processing paragraph-by-paragraph
    if options.preserve_chunks:
        while True:
            chunk = reader.get_next_chunk()
            if not chunk:
                break  # end of input
            # process each sentence separately
            for sent in chunk.sentences():
                # wrap the sentence as an AnnotatedSentence
                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
                process_sent(asent, wccl_file, options.ann_info)
            # save processed chunk
            # NOTE: if the input sent was not AnnotatedSentence, the changes
            # will be discarded
            writer.write_chunk(chunk)
    else:
        while True:
            sent = reader.get_next_sentence()
            if not sent:
                break  # end of input
            # wrap the sentence as an AnnotatedSentence
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            process_sent(asent, wccl_file, options.ann_info)
            # save processed sentence (safe)
            # NOTE: if the input sent was not AnnotatedSentence, the changes
            # will be discarded
            writer.write_sentence(sent)
""" Split annotated data to training and test set(s). Usage: split_data.py by (documents | sentences) <src-dir> percentage <train-pct> [options] split_data.py by (documents | sentences) <src-dir> equal <numfolds> [options] -o=DIR, --output=DIR output dir [default: <src-dir>] """ from docopt import docopt import corpus2 import os from random import shuffle import numpy as np from corpus_merge import merge tagset = corpus2.get_named_tagset('nkjp') def main(): args = docopt(__doc__) print args out_dir = args['--output'] if args['--output'] != '<src-dir>' else args[ '<src-dir'] if args['percentage']: args['<train-pct>'] = int( args['<train-pct>']) # replace with Schema package else: args['<numfolds>'] = int(args['<numfolds>']) positive, negative = [], [] for path in list_files(args['<src-dir>']): file_positive, file_negative = check_file_examples(
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-q', '--quiet', action='store_false',
        default=True, dest='verbose')
    parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
    (options, args) = parser.parse_args()
    if len(args) != 3:
        print 'You need to provide TAGOUT, MORPHO and OUTPUT files.'
        print 'See --help for details.'
        print
        sys.exit(1)
    tag_fn, mor_fn, out_fn = args
    tagset = corpus2.get_named_tagset(options.tagset)
    tag_rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, tag_fn)
    mor_rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, mor_fn)
    writer = corpus2.TokenWriter.create_path_writer(
        options.output_format, out_fn, tagset)
    while True:
        mor_sent = mor_rdr.get_next_sentence()
        tag_sent = tag_rdr.get_next_sentence()
        assert (not mor_sent) == (not tag_sent)
        if not mor_sent:
            break
        for mor_tok, tag_tok in zip(mor_sent.tokens(), tag_sent.tokens()):
            assert unicode(mor_tok.orth()) == unicode(tag_tok.orth()), \
                unicode(tag_tok.orth())
            tag_tok.set_wa(mor_tok.wa())
        writer.write_sentence(tag_sent)
    writer.finish()
def go():
    parser = OptionParser(
        usage="Tool for preparing data for embeddings training")
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-p', '--par-sep', type='string', action='store',
        dest='par_sep', default='',
        help='set the paragraph separator; default: (empty)')
    parser.add_option('-s', '--sent-sep', type='string', action='store',
        dest='sent_sep', default='\n',
        help='set the sentence separator; default: (newline)')
    parser.add_option('--separate-tokens', action='store_true', default=True,
        dest='separate_tokens',
        help='separate all tokens with space')
    parser.add_option('--ignore-ns-sent', action='store_true', default=False,
        dest='ignore_ns_sent',
        help='ignore no-space markers on sent boundaries')
    parser.add_option('-f', '--feature', type='string', action='store',
        dest='feature', default='lemma',
        help='set the feature to extract; default: lemma')
    parser.add_option('-w', '--wordnet_path', type='string', action='store',
        dest='wordnet_path',
        default='/home/michal/dev/ipi/korpusy/plwordnet_2_1_0/plwordnet_2_1_0_pwn_format',
        help='wordnet path')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='text',
        help='set the output format (text); default: text')
    parser.add_option('-l', '--limit', type='int', action='store',
        dest='limit', default=0,
        help='set the tokens number limit; default: 0')
    parser.add_option('--liner_jar', type='string', action='store',
        dest='liner_jar',
        default='/home/michal/dev/ipi/liner2/g419-liner2-cli/build/libs/g419-liner2-cli-2.5-SNAPSHOT-all.jar',
        help='liner jar path, required for wordnet features')
    parser.add_option('--liner_lib', type='string', action='store',
        dest='liner_lib', default='/home/michal/dev/ipi/liner2/lib',
        help='liner lib path, required for wordnet features')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        print 'Need to provide input and output.'
        print 'See --help for details.'
        print
        sys.exit(1)
    fn_input, fn_output = args

    def get_wordnet():
        return LinerWordnet(options.wordnet_path, options.liner_jar,
                            options.liner_lib)

    feature_generator = None
    if options.feature.startswith("hypernym"):
        wordnet = get_wordnet()
        feat = wordnet.get_hypernym_feature(options.feature,
                                            int(options.feature.split("-")[1]))

        def feature_generator(tok):
            return unicode(feat.generate(lemstrings_of_token(tok)))
    if options.feature.startswith("synonym"):
        wordnet = get_wordnet()
        feat = wordnet.get_synonym_feature()

        def feature_generator(tok):
            return unicode(feat.generate(lemstrings_of_token(tok)))
    if options.feature == 'ctag':
        def feature_generator(tok):
            return tagstrings_of_token(tok, tagset)
    if options.feature == 'orth':
        def feature_generator(tok):
            return orth_of_token(tok)
    if options.feature == 'lemma':
        def feature_generator(tok):
            return lemstrings_of_token(tok, tagset)
    if options.feature == 'lemma.ctag':
        def feature_generator(tok):
            return lemstrings_of_token(tok, tagset) + '.' + \
                tagstrings_of_token(tok, tagset)
    if options.feature == 'lemma.class':
        def feature_generator(tok):
            return lemstrings_of_token(tok, tagset) + '.' + \
                class_of_token(tok, tagset)
    if options.feature == 'class':
        def feature_generator(tok):
            return class_of_token(tok, tagset)
    if not feature_generator:
        print 'Unknown feature'
        print
        sys.exit(1)
    limit = options.limit
    print("token limit: " + str(limit))
    file_encoding = None
    if options.output_format == 'text':
        file_encoding = 'utf-8'
    token_count = 0
    with codecs.open(fn_output, 'wb', file_encoding, buffering=16777216) as out:
        tagset = corpus2.get_named_tagset(options.tagset)
        rdr = corpus2.TokenReader.create_path_reader(
            options.input_format, tagset, fn_input)
        sentfirst = True  # if non-empty sent sep, skip pre-spaces
        limit_reached = False
        while not limit_reached:
            sent = rdr.get_next_sentence()
            if not sent:
                break
            for tok in sent.tokens():
                if limit and token_count >= limit:
                    limit_reached = True
                    break
                token_count += 1
                feat_val = feature_generator(tok)
                if (sentfirst and options.ignore_ns_sent) or \
                        tok.after_space() or options.separate_tokens:
                    out.write(' ')
                out.write(feat_val)
                sentfirst = False
            out.write(options.sent_sep)
    print('token count: ' + str(token_count))
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-q', '--quiet', action='store_false',
        default=True, dest='verbose')
    parser.add_option('-s', '--ignore-spaces', action='store_false',
        default=True, dest='respect_spaces',
        help='ignore spaces between tokens when comparing')
    parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
    (options, args) = parser.parse_args()
    if len(args) != 3:
        print 'You need to provide REF, REANA and OUTPUT files.'
        print 'See --help for details.'
        print
        sys.exit(1)
    ref_fn, rea_fn, out_fn = args
    num_ref_toks = 0
    num_segchange_toks = 0
    num_handled_toks = 0
    num_igned_toks = 0
    tagset = corpus2.get_named_tagset(options.tagset)
    ref_rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, ref_fn)
    rea_rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, rea_fn)
    writer = corpus2.TokenWriter.create_path_writer(
        options.output_format, out_fn, tagset)
    stats = Stats(tagset, options.respect_spaces, options.verbose,
                  options.debug_mode)
    for sent in synced_sents(ref_rdr, rea_rdr, stats):
        writer.write_sentence(sent)
    if options.verbose:
        stats.dump()
def __init__(self, path):
    self.tagset = corpus2.get_named_tagset('nkjp')
    self.input_format = 'ccl'
    self.writer = corpus2.TokenWriter.create_path_writer(
        self.input_format, path, self.tagset)
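# A hypothetical companion method for the writer wrapper above, illustrating
# the write/finish idiom used by the other scripts in this file
# (write_sentence() and finish() appear elsewhere here); the method name and
# its existence in the original class are an assumption.
def write_and_close(self, sents):
    for sent in sents:
        self.writer.write_sentence(sent)
    self.writer.finish()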
def process_file(wccl_rules, file_path, annotations):
    tagset = "nkjp"
    ctagset = corpus2.get_named_tagset(tagset)
    channels = ['t3_range', 't3_date', 't3_time', 't3_set', 't3_duration']
    p = wccl.Parser(ctagset)
    wc = p.parseWcclFile(wccl_rules)
    required = set()
    display = set()
    for (name, val) in annotations.items():
        if val['required'] == True:
            required.add(name)
        display.add(name)
    items = []
    tok_reader = corpus2.TokenReader.create_path_reader(
        'ccl', corpus2.get_named_tagset(tagset), file_path)
    while True:
        sent = tok_reader.get_next_sentence()
        if sent:
            item = ""
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            match_rules = wc.get_match_rules_ptr()
            match_rules.apply_all(asent)
            #
            # Auxiliary annotations
            #
            #aux_channels = []
            #for channel in asent.all_channels():
            #    if channel.startswith("aux_"):
            #        aux_channels.append(channel)
            #if len(aux_channels)>0:
            if contains_all(asent.all_channels(), required):
                aux_start = {}
                aux_end = set()
                for channel_name in asent.all_channels():
                    if channel_name in display:
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            # Starting tags
                            start_index = ann.indices[0]
                            if start_index not in aux_start:
                                aux_start[start_index] = []
                            aux_start[start_index].append(
                                (channel_name,
                                 ann.indices[-1] - ann.indices[0]))
                            # Ending tags
                            aux_end.add("%s:%d" % (channel_name,
                                                   ann.indices[-1]))
                if len(aux_start) > 0:
                    item += "<div class='aux'>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        if i in aux_start:
                            sorted_x = reversed(
                                sorted(aux_start[i], key=operator.itemgetter(1)))
                            for (channel_name, length) in sorted_x:
                                color = "black" if channel_name not in annotations \
                                    else annotations[channel_name]['color']
                                item += "<span class='%s aux' title='%s' style='border-color: %s'>" % (
                                    channel_name, channel_name, color)
                        # text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in display:
                            if channel_name + ":" + str(i) in aux_end:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"
            if len(item) > 0:
                items.append(item)
        else:
            break
    del tok_reader
    args = {}
    args["items"] = items
    return args
# -*- coding: utf-8 -*-
"""
Script annotating relation contexts between given lexical unit pairs.

Usage:
  extract_sentences.py <pairs> <src-dir> <out-dir>
"""
import corpus2
import os
from multiprocessing import Process, Queue
import itertools
import codecs
from docopt import docopt

NUM_THREADS = 6
MAX_CONTEXT_LEN = 10
tagset = corpus2.get_named_tagset('nkjp')
pairs = set()
output_dir = ''


class Consumer(Process):
    def __init__(self, task_queue):
        Process.__init__(self)
        self.task_queue = task_queue

    def run(self):
        while True:
            path = self.task_queue.get()
            if path is None:
                # Poison pill means we should exit
                break
def test_multiple_sentences_path_writer(self):
    path = self._save_temp_file('multiple_sentences_path_writer.ccl',
                                documents.many_sentences_ccl)
    self.tagset = corpus2.get_named_tagset('nkjp')
    reader = corpus2mwe.CclMWEReader(path, self.tagset)
    reader.use_annotations(False)
    doc = reader.read()
    writer = corpus2.TokenWriter.create_path_writer(
        'ccl:gz', path + '.gz', self.tagset)
    for chunk in doc.paragraphs():
        writer.write_chunk(chunk)
    del writer
    readerGZ = corpus2.TokenReader.create_path_reader(
        'ccl:gz', self.tagset, path + '.gz')
    self.assertEqual(u'Szlachetnie urodzona',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'żelazna dziewica',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'napchała się',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'lanymi kluskami',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'.',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'Świeżo upieczona',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'juniorka młodsza',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'spotkała',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'płetwala karłowatego',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'i',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'razem',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'z',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'nim',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u',',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'po',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'wiejsku',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u',',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'nacieszyła się',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'zespołem',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'Rittera',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'.',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'Preimplantacyjna diagnostyka',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'jest',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'luksusowym dobrem',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'w',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'socjalistycznym realizmie',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
    self.assertEqual(u'.',
                     readerGZ.get_next_token().orth_utf8().decode('utf8'))
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-s', '--stat', action='store_true', dest='stat_mode',
        help='output P,R,F with no text labels, order like in normal mode:\n'
             ' Chunks or heads\n Chunks and heads\n Chunks match\n Heads match')
    (options, args) = parser.parse_args()
    stat_mode = options.stat_mode
    if len(args) != 3:
        sys.stderr.write('No args. See --help\n')
        sys.exit(1)
    batch_ref, batch_target, rel_name = args
    rel_stats = RelStats()
    corpus_type = "document"
    tagset = corpus2.get_named_tagset(options.tagset)
    ref_count = 0
    target_count = 0
    ref_file = open(batch_ref, "r")
    target_file = open(batch_target, "r")
    line_ref = ref_file.readline()
    line_target = target_file.readline()
    while line_ref and line_target:
        line_ref = line_ref.strip()
        ref_ccl_filename, ref_rel_filename = line_ref.split(";")
        line_target = line_target.strip()
        target_ccl_filename, target_rel_filename = line_target.split(";")
        ref_ccl_rdr = corpus2.CclRelReader(
            tagset, ref_ccl_filename, ref_rel_filename)
        target_ccl_rdr = corpus2.CclRelReader(
            tagset, target_ccl_filename, target_rel_filename)
        ref_doc = ref_ccl_rdr.read()
        target_doc = target_ccl_rdr.read()
        ref_rels = list(r for r in ref_doc.relations()
                        if r.rel_name() == rel_name)
        target_rels = list(t for t in target_doc.relations()
                           if t.rel_name() == rel_name)
        ref_count += len(ref_rels)
        target_count += len(target_rels)
        ref_sents = dict([(s.id(), corpus2.AnnotatedSentence.wrap_sentence(s))
                          for c in ref_doc.paragraphs()
                          for s in c.sentences()])
        target_sents = dict([(s.id(), corpus2.AnnotatedSentence.wrap_sentence(s))
                             for c in target_doc.paragraphs()
                             for s in c.sentences()])
        for pattern in ref_rels:
            t = filter(lambda x: (compare(x, pattern) == 0), target_rels)
            if len(t) > 0:
                t = t[0]
                r = pattern
                both, chun, head = 0, 0, 0
                for dir_point_ref, dir_point_target in zip(
                        [r.rel_from(), r.rel_to()],
                        [t.rel_from(), t.rel_to()]):
                    ref_ann_sent = ref_sents[dir_point_ref.sentence_id()]
                    target_ann_sent = target_sents[dir_point_target.sentence_id()]
                    b, c, h = rel_stats.verify_relation(
                        ref_ann_sent, dir_point_ref,
                        target_ann_sent, dir_point_target)
                    both, chun, head = map(sum, zip([b, c, h],
                                                    [both, chun, head]))
                rel_stats.update_stats(both, chun, head)
        line_ref = ref_file.readline()
        line_target = target_file.readline()
    rel_stats.print_stats(ref_count, target_count, stat_mode)
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-q', '--quiet', action='store_false',
        default=True, dest='verbose')
    parser.add_option('-u', '--unk-tag', type='string', action='store',
        dest='unk_tag', default='ign',
        help='set the tag used for unknown forms; default: ign')
    parser.add_option('-k', '--keep-optional', action='store_false',
        default=True, dest='expand_optional',
        help='do not expand unspecified optional attributes to multiple tags')
    parser.add_option('-s', '--ignore-spaces', action='store_false',
        default=True, dest='respect_spaces',
        help='ignore spaces between tokens when comparing')
    parser.add_option('-f', '--first-lexeme-only', action='store_true',
        default=False, dest='first_lex_only',
        help='read only each token\'s first disamb lexeme (tag+lemma)')
    parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
    (options, args) = parser.parse_args()
    if len(args) < 2 or len(args) % 2 != 0:
        print 'You need to provide a series of tagged folds and a corresponding'
        print 'series of reference folds.'
        print 'See --help for details.'
        print
        sys.exit(1)
    tagset = corpus2.get_named_tagset(options.tagset)
    num_folds = len(args) / 2
    weak_lem_lower_bound = 0.0
    kn_strong_lem_lower_bound = 0.0
    unk_strong_lem_lower_bound = 0.0
    strong_lem_lower_bound = 0.0
    strong_lem_nocase_lower_bound = 0.0
    strong_lem_case_cat_heur = 0.0
    strong_lem_nocase_cat_heur = 0.0
    weak_lower_bound = 0.0
    weak_upper_bound = 0.0
    strong_pos_lower = 0.0
    unk_weak_lower = 0.0
    unk_weak_upper = 0.0
    kn_weak_lower = 0.0
    kn_weak_upper = 0.0
    perc_unk = 0.0
    perc_segchange = 0.0
    for fold_idx in range(num_folds):
        tag_fn = args[fold_idx]  # filename of tagged fold @ fold_idx
        ref_fn = args[fold_idx + num_folds]  # ... reference fold @ fold_idx
        if options.verbose:
            print '### FOLD %2d: %s (tag) v. %s (ref)' % (
                (fold_idx + 1), tag_fn, ref_fn)
        tag_rdr = corpus2.TokenReader.create_path_reader(
            options.input_format, tagset, tag_fn)
        ref_rdr = corpus2.TokenReader.create_path_reader(
            options.input_format, tagset, ref_fn)
        res = TokComp(tagset, options.unk_tag, options.expand_optional,
                      options.first_lex_only, options.debug_mode)
        for tag_seq, ref_seq in tok_seqs(tag_rdr, ref_rdr,
                                         options.respect_spaces,
                                         options.verbose, options.debug_mode):
            res.update(tag_seq, ref_seq)
        print "PolEval 2017 competition scores"
        print "-------------------------------"
        print 'POS accuracy (Subtask A score): \t%.4f%%' % res.value_of(Metric.SC_LOWER)
        print 'POS accuracy (known words): \t%.4f%%' % res.value_of(Metric.KN_SC_LOWER)
        print 'POS accuracy (unknown words): \t%.4f%%' % res.value_of(Metric.UNK_SC_LOWER)
        print 'Lemmatization accuracy (Subtask B score): \t%.4f%%' % res.value_of(Metric.SL_LOWER)
        print 'Lemmatization accuracy (known words): \t%.4f%%' % res.value_of(Metric.KN_SL_LOWER)
        print 'Lemmatization accuracy (unknown words): \t%.4f%%' % res.value_of(Metric.UNK_SL_LOWER)
        print 'Overall accuracy (Subtask C score): \t%.4f%%' % (
            (res.value_of(Metric.SC_LOWER) + res.value_of(Metric.SL_LOWER)) / 2)
        if options.verbose:
            res.dump()
        weak_lem_lower_bound += res.value_of(Metric.WL_LOWER)
        kn_strong_lem_lower_bound += res.value_of(Metric.KN_SL_LOWER)
        unk_strong_lem_lower_bound += res.value_of(Metric.UNK_SL_LOWER)
        strong_lem_lower_bound += res.value_of(Metric.SL_LOWER)
        strong_lem_nocase_lower_bound += res.value_of(Metric.SL_NOCASE_LOWER)
        strong_lem_case_cat_heur += res.value_of(Metric.SL_CASE_CAT_HEUR)
        strong_lem_nocase_cat_heur += res.value_of(Metric.SL_NOCASE_CAT_HEUR)
        weak_lower_bound += res.value_of(Metric.WC_LOWER)
        weak_upper_bound += res.value_of(Metric.WC_LOWER) + res.value_of(Metric.SEG_CHANGE)
        unk_weak_lower += res.value_of(Metric.UNK_WC_LOWER)
        unk_weak_upper += res.value_of(Metric.UNK_WC_LOWER) + res.value_of(Metric.UNK_SEG_CHANGE)
        kn_weak_lower += res.value_of(Metric.KN_WC_LOWER)
        kn_weak_upper += res.value_of(Metric.KN_WC_LOWER) + res.value_of(Metric.KN_SEG_CHANGE)
        strong_pos_lower += res.value_of(Metric.POS_SC_LOWER)
        perc_unk += res.value_of(Metric.UNK)
        perc_segchange += res.value_of(Metric.SEG_CHANGE)
    # weak lemma -- when sets of possible lemmas output and in ref corp intersect
    print 'AVG weak lemma lower bound\t%.4f%%' % (weak_lem_lower_bound / num_folds)
    print 'AVG KN strong lemma lower bound\t%.4f%%' % (kn_strong_lem_lower_bound / num_folds)
    print 'AVG UNK strong lemma lower bound\t%.4f%%' % (unk_strong_lem_lower_bound / num_folds)
    # strong lemma -- when sets of possible lemmas output and in ref corp are equal
    print 'AVG strong lemma lower bound\t%.4f%%' % (strong_lem_lower_bound / num_folds)
    print 'AVG strong lemma nocase lower bound\t%.4f%%' % (strong_lem_nocase_lower_bound / num_folds)
    print 'AVG strong lemma case concat heur\t%.4f%%' % (strong_lem_case_cat_heur / num_folds)
    print 'AVG strong lemma nocase concat heur\t%.4f%%' % (strong_lem_nocase_cat_heur / num_folds)
    print 'AVG weak corr lower bound\t%.4f%%' % (weak_lower_bound / num_folds)
    print 'AVG weak corr upper bound\t%.4f%%' % (weak_upper_bound / num_folds)
    print 'AVG UNK weak corr lower bound\t%.4f%%' % (unk_weak_lower / num_folds)
    print 'AVG UNK weak corr upper bound\t%.4f%%' % (unk_weak_upper / num_folds)
    print 'AVG KN weak corr lower bound\t%.4f%%' % (kn_weak_lower / num_folds)
    print 'AVG KN weak corr upper bound\t%.4f%%' % (kn_weak_upper / num_folds)
    print 'AVG POS strong corr lower bound\t%.4f%%' % (strong_pos_lower / num_folds)
    print 'AVG percentage UNK\t%.4f%%' % (perc_unk / num_folds)
    print 'AVG percentage seg change\t%.4f%%' % (perc_segchange / num_folds)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import wccl
import corpus2

TAGSET = 'nkjp'
TAGSET_OBJECT = corpus2.get_named_tagset(TAGSET)

# helper methods that may be unused for now, but it took me a while to figure
# out how to do this, so I'm leaving them here - they may come in handy


def _get_token_all_classes(token):
    tag = token.get_preferred_lexeme(TAGSET_OBJECT).tag()
    return TAGSET_OBJECT.tag_to_symbol_string(tag).split(',')


def _get_token_classes(token, name):
    tag = token.get_preferred_lexeme(TAGSET_OBJECT).tag()
    mask = corpus2.get_attribute_mask(TAGSET_OBJECT, name)
    return TAGSET_OBJECT.tag_to_symbol_string(tag.get_masked(mask)).split(',')


def _check_token_belong_to_all(token, names):
    values = _get_token_all_classes(token)
    for name in names:
        if not name in values:
            return False
    return True


def _check_token_belong_to_any(token, names):
    values = _get_token_all_classes(token)
    for value in values:
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-f', '--num-folds', type='int', action='store',
        dest='num_folds', default=10,
        help='set the number of folds (default: 10)')
    parser.add_option('-s', '--seed-word', type='string', action='store',
        dest='seedword', default='korpus',
        help='set the seedword; default: korpus')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='verbose mode')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        print 'Need to provide input file and output dir.'
        print 'See --help for details.'
        print
        sys.exit(1)
    fold_nums = range(options.num_folds)
    fn_input, fold_dir = args
    tagset = corpus2.get_named_tagset(options.tagset)
    # count paragraphs in input
    if options.verbose:
        sys.stderr.write('Counting paragraphs... ')
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, fn_input)
    num_pars = 0
    while True:
        par = rdr.get_next_chunk()
        if not par:
            break
        num_pars += 1
    del rdr
    if options.verbose:
        sys.stderr.write('%d\n' % num_pars)
    # prepare index -- where to send ith paragraph
    rnd = random.Random(options.seedword)
    fold_of_par = [(par_idx % options.num_folds) for par_idx in xrange(num_pars)]
    rnd.shuffle(fold_of_par)
    # now the real run
    if options.verbose:
        sys.stderr.write('Generating folds...\n')
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, fn_input)
    fold_test = [corpus2.TokenWriter.create_path_writer(
        options.output_format,
        os.path.join(fold_dir, 'test%02d.xml' % (num + 1)),
        tagset) for num in fold_nums]
    fold_train = [corpus2.TokenWriter.create_path_writer(
        options.output_format,
        os.path.join(fold_dir, 'train%02d.xml' % (num + 1)),
        tagset) for num in fold_nums]
    first = True
    par_now = 0
    while True:
        par = rdr.get_next_chunk()
        if not par:
            break
        fold_now = fold_of_par[par_now]
        fold_test[fold_now].write_chunk(par)
        for other_num in fold_nums:
            if other_num != fold_now:
                fold_train[other_num].write_chunk(par)
        #fold_now = (fold_now + 1) % options.num_folds
        par_now += 1
    del rdr
    for w in fold_test:
        w.finish()
    for w in fold_train:
        w.finish()
def process_files(start, offset, wccl_rules, corpus_path):
    tagset = "nkjp"
    ctagset = corpus2.get_named_tagset(tagset)
    root = os.path.dirname(corpus_path) + "/"
    list = corpus_path
    channels = ['t3_range', 't3_date', 't3_time', 't3_set', 't3_duration']
    p = wccl.Parser(ctagset)
    wc = p.parseWcclFile(wccl_rules)
    processed = 0
    docs = {}
    filenames = codecs.open(list, "r", "utf-8").readlines()
    docs_filenames = []
    for line in filenames[start:start + offset]:
        processed += 1
        line = line.strip()
        xml_file = (root + line).encode("utf-8")
        if os.path.isfile(xml_file):
            docs_filenames.append(line)
            docs[line] = corpus2.TokenReader.create_path_reader(
                'ccl', corpus2.get_named_tagset(tagset), xml_file)
        else:
            #print "ERROR: File not found " + xml_file
            pass
    items = []
    for filename in docs_filenames:
        tok_reader = docs[filename]
        while True:
            sent = tok_reader.get_next_sentence()
            if sent:
                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
                match_rules = wc.get_match_rules_ptr()
                ans_ref = set()
                ans_cmp = set()
                for channel_name in channels:
                    if asent.has_channel(channel_name):
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            ans_ref.add("%s:%d:%d" % (channel_name,
                                                      ann.indices[0],
                                                      ann.indices[-1]))
                        ch = asent.get_channel(channel_name)
                        for i in range(0, len(asent.tokens())):
                            ch.set_segment_at(i, 0)
                match_rules.apply_all(asent)
                for channel_name in channels:
                    if asent.has_channel(channel_name):
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            ans_cmp.add("%s:%d:%d" % (channel_name,
                                                      ann.indices[0],
                                                      ann.indices[-1]))
                ends = set()
                tp = set()
                fp = set()
                fn = set()
                fn_ends = set()
                for an in ans_cmp:
                    (chan, start, end) = an.split(":")
                    ends.add(chan + ":" + end)
                    if an in ans_ref:
                        tp.add(chan + ":" + start)
                    else:
                        fp.add(chan + ":" + start)
                for an in ans_ref:
                    (chan, start, end) = an.split(":")
                    if an not in ans_cmp:
                        fn.add(chan + ":" + start)
                        fn_ends.add(chan + ":" + end)
                item = ""
                #
                # Auxiliary annotations
                #
                aux_channels = []
                for channel in asent.all_channels():
                    if channel.startswith("aux_"):
                        aux_channels.append(channel)
                if len(aux_channels) > 0:
                    aux_start = set()
                    aux_end = set()
                    for channel_name in aux_channels:
                        for ann in asent.get_channel(
                                channel_name).make_annotation_vector():
                            aux_start.add("%s:%d" % (channel_name, ann.indices[0]))
                            aux_end.add("%s:%d" % (channel_name, ann.indices[-1]))
                    item += "<div class='aux'><div>Auxiliary annotations:</div>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        for channel_name in aux_channels:
                            key = "%s:%d" % (channel_name, i)
                            if key in aux_start:
                                item += "<span class='%s' title='%s'>" % (
                                    channel_name, channel_name)
                        # text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in aux_channels:
                            if channel_name + ":" + str(i) in aux_end:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"
                #
                # Recognized annotations
                #
                if len(ans_cmp) > 0:
                    item += "<div class='recognized'><div>Recognized annotations:</div>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        for channel_name in channels:
                            key = "%s:%d" % (channel_name, i)
                            if key in tp:
                                item += "<span class='%s tp' title='%s'>" % (
                                    channel_name, channel_name)
                            if key in fp:
                                item += "<span class='%s fp' title='%s'>" % (
                                    channel_name, channel_name)
                        # text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in channels:
                            if channel_name + ":" + str(i) in ends:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"
                #
                # Missing annotations
                #
                if len(fn) > 0:
                    item += "<div class='reference'><div>Missing annotations:</div>"
                    for i in range(0, len(asent.tokens())):
                        lexem = asent.tokens()[i].get_preferred_lexeme(ctagset)
                        # opening tags
                        for channel_name in channels:
                            key = "%s:%d" % (channel_name, i)
                            if key in fn:
                                item += "<span class='%s fn' title='%s'>" % (
                                    channel_name, channel_name)
                        # text
                        ctag = ctagset.tag_to_string(lexem.tag()).split(":")
                        item += "<tok title='%s'>" % (lexem_to_title(lexem, ctagset))
                        item += str(asent.tokens()[i].orth())
                        item += "</tok>"
                        # closing tags
                        for channel_name in channels:
                            if channel_name + ":" + str(i) in fn_ends:
                                item += "</span>"
                        # separator
                        item += " "
                    item += "</div>"
                if len(item) > 0:
                    id = os.path.basename(filename.encode("utf-8")).split(".")[0]
                    href = '<a href="http://www.nlp.pwr.wroc.pl/inforex?page=report&id=%s" target="_blank">%s</a>' % (
                        id, filename.encode("utf-8"))
                    items.append("<b>" + href + "</b>" + item)
            else:
                break
        del tok_reader
    args = {}
    args["processed"] = processed
    args["items"] = items
    args["total"] = len(filenames)
    return args