def _write_reject(where, dropped_from, errcomms, opcode, rejection): """ where is either: ['fname', fname] or ['docid', document_id, data_sort] errcoms is a list in the form: [[errcode, comments], e... ] where 'errcode' and 'comments' are defined as: comments -- list of comment lines. If there are literal newlines in a comment line, that's fine and new line will be indented. Comments may be the empty list for one or all errorcodes. errcode -- a short string to be looked up in the ERRS table. """ if False: if where[0] == "fname": fname_base = where[1] else: assert where[0] == "docid" document_id, data_sort = where[1:] try: fname_base = output_file_name(document_id, data_sort) except IndexError: print document_id, data_sort raise try: mkdirs(os.path.dirname(fname_base)) except Exception: pass # already exists fname = fname_base + ".rejects" with codecs.open(fname, "a", "utf8") as outf: errnums = [] try: for errcode, comments in errcomms: errnum = "%s%s" % (ERRS[dropped_from][0], ERRS[dropped_from][1][errcode][0]) errmsg = ERRS[dropped_from][1][errcode][1] errnums.append(errnum) outf.write("; %s %s\n" % (errnum, errmsg)) for line in comments: line = unicode2buckwalter(line) indent = " " * 6 try: line = indent + line.replace( "\n", "\n" + indent + " ") except Exception: print "%r" % line raise for subline in line.split("\n"): outf.write("; %s\n" % subline) except ValueError: pprint.pprint(errcomms) raise outf.write("%s %s %s\n;\n;\n" % (opcode, ",".join(errnums), rejection))
def callisto_to_sgml(fname, out_sgml=None, buckit=False, language="unknown",
                     wrap=True):
    """Convert a callisto xml file into inline-SGML name or coref annotation.

    The annotations returned by parse_callisto_xml(fname) are spliced into
    the source text as tags and the result is written, utf8 encoded, to
    ``out_sgml`` if given, otherwise to ``fname + ".coref"`` or
    ``fname + ".name"`` depending on which kind of annotation the file
    contains.

    fname    -- path of the callisto xml file.
    out_sgml -- explicit output path; when false-y the path is derived
                from fname and the detected annotation kind.
    buckit   -- if true, run the text through unicode2buckwalter (with
                sgml_safe=True) before writing it out.
    language -- accepted for interface compatibility; not used here.
    wrap     -- if true, wrap the whole thing in <DOC ...> </DOC>.

    Raises:
      OnCommonUtilNeededError -- on.common.util cannot be imported.
      FileContainsBothNameAndCorefAnnotationException -- both name and
          coref annotations were found.
      NoAnnotationFoundException -- no annotations were found at all.
      DeSubtokenizationFailedException -- desubtokenize_annotations raised;
          the original exception is passed along as the second argument.

    The output text is fully built before the output file is opened, so a
    failure in any of the steps above never creates or truncates the file.
    """

    try:
        from on.common.util import unicode2buckwalter, desubtokenize_annotations
    except ImportError:
        raise OnCommonUtilNeededError("callisto_to_sgml")

    document_id, annotation_opens, annotation_closes, source_text_raw = \
        parse_callisto_xml(fname)

    # one list slot per element of the raw text so tags can be attached
    # at an index without shifting later indices
    source_text = list(source_text_raw)

    names = corefs = 0

    for open_annotation in annotation_opens:
        idx = open_annotation[0]

        if open_annotation[1] == "name":
            open_annotation_str = "<%s>" % open_annotation[2]
            names += 1
        else:
            open_annotation_str = '<COREF-ID="%s"-TYPE="%s"%s>' % (
                open_annotation[2],
                open_annotation[3],
                '-SUBTYPE="%s"' % open_annotation[4] if open_annotation[4] else "")
            corefs += 1

        source_text[idx] = "%s%s" % (open_annotation_str, source_text[idx])

    # close tags are prepended after all open tags, so at a shared index
    # they end up in front of any open tag inserted there
    for close_annotation in annotation_closes:
        idx = close_annotation[0]

        if close_annotation[1] == "name":
            close_annotation_str = "</%s>" % close_annotation[2]
        else:
            close_annotation_str = "</COREF>"

        source_text[idx] = "%s%s" % (close_annotation_str, source_text[idx])

    if corefs and not names:
        ext = "coref"
    elif names and not corefs:
        ext = "name"
    elif names and corefs:
        raise FileContainsBothNameAndCorefAnnotationException(fname)
    else:
        raise NoAnnotationFoundException(fname)

    filename = out_sgml if out_sgml else fname + "." + ext

    # Build the complete output text BEFORE touching the output file:
    # opening with "w" truncates, so failing afterwards would leave a
    # half-written file behind.
    n_text = "".join(source_text)
    try:
        # the second element of the result is not needed here
        n_text, _num_fixed = desubtokenize_annotations(
            n_text, add_offset_notations=True)
    except Exception as e:
        raise DeSubtokenizationFailedException(fname, e)

    if buckit:
        n_text = unicode2buckwalter(n_text, sgml_safe=True)

    n_text = n_text.replace("\r", " ")
    n_text = n_text.strip()

    with codecs.open(filename, "w", "utf8") as out_f:
        if wrap:
            out_f.write('<DOC DOCNO="%s">\n' % document_id)

        out_f.write(n_text)
        out_f.write("\n")

        if wrap:
            out_f.write('</DOC>\n')