if 'Title' in doc.fields: outs.write(u"%%#Field\tTitle\n") write_text(doc.fields['Title'], outs) outs.write(u"%%#Field\tBody\n") write_text(doc.fields['Body'], outs) def write_text(text, outs): for token in text: outs.write(u"\t".join(token)) outs.write("\n") if __name__ == '__main__': import sys try: params, args = cmd_utils.get_params_sing(sys.argv[1:], 'i:o:m:ta', 'i', 0) if not os.path.isdir(params['i']): raise ValueError('Input must be a directory of files.') except ValueError as err: print('Error: {0}'.format(err)) print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' + '[-a]').format(sys.argv[0])) print(' input_dir: the directory with the input text files.') print(' hunpos_model: the hunpos model file.') print(' output_file: the conll2 output file. If omitted, the result will') print(' be written to stdout.') print(' hunpos_model: the hunpos model file.') print(' -a: the output is appended to output_file, instead of overwriting it.') sys.exit() if 'o' in params:
if 'Body' in doc.fields: outs.write(u"%%#Field\tBody\n") write_text(doc.fields['Body'], outs) def write_text(text, outs): for token in text: outs.write(u"\t".join(token)) outs.write("\n") if __name__ == '__main__': import sys try: params, args = cmd_utils.get_params_sing(sys.argv[1:], 'i:o:m:ta', 'i', 0) if not os.path.isdir(params['i']): raise ValueError('Input must be a directory of files.') except ValueError as err: print('Error: {0}'.format(err)) print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' + '[-t] [-a]').format(sys.argv[0])) print(' input_dir: the directory with the input text files.') print(' hunpos_model: the hunpos model file.') print( ' output_file: the conll2 output file. If omitted, the result will' ) print(' be written to stdout.') print(' hunpos_model: the hunpos model file.') print( ' -t: If specified, the first non-empty line of the the text files are'
for mapping in mappings: try: key, value = mapping.strip().split("\t") type_map[key] = value except (ValueError): continue return type_map def print_usage_and_exit(): sys.stderr.write('Usage: {0} dbpedia_type_file [-c classes_OWL_file] [-m NE_mappings]\n'.format(__file__)) sys.exit() if __name__ == '__main__': import sys try: params, args = get_params_sing(sys.argv[1:], 'c:m:k', '', 1) except ValueError as ve: sys.stderr.write(ve + "\n") print_usage_and_exit() if len(args) != 1: print_usage_and_exit() # with open(sys.argv[1], 'r', encoding = 'utf-8') as type_stream: with FileReader(args[0], encoding='utf-8').open() as type_stream: lines = merge_pairs(extract_dbpedia_type(type_stream)) filter = __read_map(params['m']) if 'm' in params else None if 'c' in params: lines = filter_general(lines, OwlClassHierarchy(params['c']), filter) if 'm' in params: lines = filter_type(lines, filter, 'k' in params)