Python unicode2buckwalter 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: on.common.util

메소드/함수: unicode2buckwalter

hotexamples.com에서의 예제들: 2

Python unicode2buckwalter - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 on.common.util.unicode2buckwalter에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

파일: log.py 프로젝트: liushui9404/coref-1

def _write_reject(where, dropped_from, errcomms, opcode, rejection):
    """
    where is either:
      ['fname', fname]
    or
      ['docid', document_id, data_sort]

    errcoms is a list in the form:
      [[errcode, comments], e... ]

    where 'errcode' and 'comments' are defined as:

      comments -- list of comment lines.  If there are literal
                  newlines in a comment line, that's fine and new line
                  will be indented.  Comments may be the empty list
                  for one or all errorcodes.

      errcode  -- a short string to be looked up in the ERRS table.

    """

    if False:
        if where[0] == "fname":
            fname_base = where[1]
        else:
            assert where[0] == "docid"
            document_id, data_sort = where[1:]
            try:
                fname_base = output_file_name(document_id, data_sort)
            except IndexError:
                print document_id, data_sort
                raise
            try:
                mkdirs(os.path.dirname(fname_base))
            except Exception:
                pass  # already exists

        fname = fname_base + ".rejects"

        with codecs.open(fname, "a", "utf8") as outf:
            errnums = []

            try:
                for errcode, comments in errcomms:
                    errnum = "%s%s" % (ERRS[dropped_from][0],
                                       ERRS[dropped_from][1][errcode][0])
                    errmsg = ERRS[dropped_from][1][errcode][1]
                    errnums.append(errnum)
                    outf.write("; %s %s\n" % (errnum, errmsg))
                    for line in comments:
                        line = unicode2buckwalter(line)

                        indent = " " * 6
                        try:
                            line = indent + line.replace(
                                "\n", "\n" + indent + "   ")
                        except Exception:
                            print "%r" % line
                            raise
                        for subline in line.split("\n"):
                            outf.write("; %s\n" % subline)
            except ValueError:
                pprint.pprint(errcomms)
                raise

            outf.write("%s %s %s\n;\n;\n" %
                       (opcode, ",".join(errnums), rejection))

예제 #2

파일 보기

파일: callisto_converter.py 프로젝트: Varal7/ontonotes-tools-py3

def callisto_to_sgml(fname,
                     out_sgml=None,
                     buckit=False,
                     language="unknown",
                     wrap=True):
    """
    Convert a callisto xml file into an inline-annotated SGML file.

    Given the fname of a callisto xml file, produce either fname.coref
    or fname.name depending on whether the file represents name or
    coref annotation.  If out_sgml is given, write there instead.

    if buckit, then run everything through unicode2buckwalter before
    writing out

    if wrap, wrap the whole thing in <DOC ...> </DOC>

    language is accepted but not used by this function.

    The output file is only opened once the full text has been built,
    so a failure during conversion does not leave a truncated file
    behind or clobber an existing out_sgml.

    Raises:
      OnCommonUtilNeededError -- on.common.util cannot be imported
      FileContainsBothNameAndCorefAnnotationException -- the file has
          both name and coref annotations
      NoAnnotationFoundException -- the file has no annotations
      DeSubtokenizationFailedException -- desubtokenize_annotations
          raised; the original exception is passed along as the
          second argument
    """

    try:
        from on.common.util import unicode2buckwalter, desubtokenize_annotations
    except ImportError:
        raise OnCommonUtilNeededError("callisto_to_sgml")

    document_id, annotation_opens, annotation_closes, source_text_raw = parse_callisto_xml(
        fname)

    # One list slot per character so tags can be spliced in by offset
    # without shifting the offsets of later annotations.
    source_text = list(source_text_raw)

    names = corefs = 0

    # Each annotation is indexed as (offset, kind, ...): for "name"
    # item 2 is the tag; otherwise items 2/3/4 are id, type and an
    # optional subtype.
    for open_annotation in annotation_opens:
        idx = open_annotation[0]
        if open_annotation[1] == "name":
            open_annotation_str = "<%s>" % open_annotation[2]
            names += 1
        else:
            if open_annotation[4]:
                subtype_str = '-SUBTYPE="%s"' % open_annotation[4]
            else:
                subtype_str = ""
            open_annotation_str = '<COREF-ID="%s"-TYPE="%s"%s>' % (
                open_annotation[2], open_annotation[3], subtype_str)
            corefs += 1

        source_text[idx] = "%s%s" % (open_annotation_str, source_text[idx])

    # Closes are applied after opens, so at a shared offset the closing
    # tag ends up in front of the opening tag.
    for close_annotation in annotation_closes:
        idx = close_annotation[0]
        if close_annotation[1] == "name":
            close_annotation_str = "</%s>" % close_annotation[2]
        else:
            close_annotation_str = "</COREF>"

        source_text[idx] = "%s%s" % (close_annotation_str, source_text[idx])

    if corefs and not names:
        ext = "coref"
    elif names and not corefs:
        ext = "name"
    elif names and corefs:
        raise FileContainsBothNameAndCorefAnnotationException(fname)
    else:
        raise NoAnnotationFoundException(fname)

    filename = out_sgml if out_sgml else fname + "." + ext

    # Build the complete output text before touching the filesystem.
    n_text = "".join(source_text)

    try:
        # The second return value (a count of fixes) is not needed here.
        n_text, _ = desubtokenize_annotations(
            n_text, add_offset_notations=True)
    except Exception as e:
        raise DeSubtokenizationFailedException(fname, e)

    if buckit:
        n_text = unicode2buckwalter(n_text, sgml_safe=True)

    n_text = n_text.replace("\r", " ")
    n_text = n_text.strip()

    with codecs.open(filename, "w", "utf8") as out_f:
        if wrap:
            out_f.write('<DOC DOCNO="%s">\n' % document_id)

        out_f.write(n_text)
        out_f.write("\n")

        if wrap:
            out_f.write('</DOC>\n')