示例#1
0
文件: corpus.py 项目: chloebt/educe
def write_annotation_file(anno_filename, doc):
    """
    Serialise a GlozzDocument as XML at ``anno_filename``.

    Thin wrapper around ``glozz.write_annotation_file`` that always passes
    ``STAC_OUTPUT_SETTINGS`` as the ``settings`` argument.
    """
    write = glozz.write_annotation_file
    write(anno_filename, doc, settings=STAC_OUTPUT_SETTINGS)
示例#2
0
文件: corpus.py 项目: tjane/educe
def write_annotation_file(anno_filename, doc):
    """
    Write the GlozzDocument ``doc`` out as XML to the path ``anno_filename``.

    Delegates to ``glozz.write_annotation_file`` with the module's
    ``STAC_OUTPUT_SETTINGS`` supplied as ``settings``.
    """
    glozz.write_annotation_file(
        anno_filename, doc, settings=STAC_OUTPUT_SETTINGS
    )
示例#3
0
def write_glozz(gdoc, path_stub):
    """
    Write a Glozz document as a text/annotation file pair.

    The UTF-8 encoding of ``gdoc.text()`` is written to ``path_stub + '.ac'``
    and the annotations are written to ``path_stub + '.aa'`` through
    ``glozz.write_annotation_file``.

    Side effect: ``gdoc.hashcode`` is overwritten with the value that
    ``glozz.hashcode`` computes from the encoded text.
    """
    # Function-scope import keeps this block self-contained. io.BytesIO is
    # the bytes buffer available on both Python 2.6+ and Python 3, whereas
    # the ``StringIO`` module only exists on Python 2.
    from io import BytesIO

    ac_path = path_stub + '.ac'
    aa_path = path_stub + '.aa'

    gdoc_bytes = gdoc.text().encode('utf-8')
    with open(ac_path, 'wb') as ac_f:
        ac_f.write(gdoc_bytes)

    # Hash exactly the bytes that were just written to the .ac file.
    gdoc.hashcode = glozz.hashcode(BytesIO(gdoc_bytes))
    glozz.write_annotation_file(aa_path, gdoc)
示例#4
0
文件: output.py 项目: tjane/educe
def save_document(output_dir, k, doc):
    """
    Save a document as a Glozz .ac/.aa pair

    The .aa file is always written, from a shallow copy of ``doc`` whose
    ``hashcode`` is computed over the UTF-8 encoded text. The .ac file is
    only written when ``k.stage`` is ``'unannotated'``.
    """
    path_prefix = output_path_stub(output_dir, k)
    mk_parent_dirs(path_prefix)
    encoded_text = doc.text().encode('utf-8')
    unannotated = k.stage == 'unannotated'

    # .aa file: the output settings depend on the stage
    if unannotated:
        aa_settings = stac_unannotated_output_settings
    else:
        aa_settings = stac_output_settings
    doc_copy = copy.copy(doc)
    doc_copy.hashcode = glozz.hashcode(BytesIO(encoded_text))
    glozz.write_annotation_file(path_prefix + ".aa",
                                doc_copy,
                                settings=aa_settings)

    # .ac file: nothing more to do unless the stage is 'unannotated'
    if not unannotated:
        return
    with open(path_prefix + ".ac", 'wb') as ac_file:
        ac_file.write(encoded_text)
def main():
    """
    Annotate every 'units' and 'discourse' document under a corpus directory.

    Command line: python nonling_annotations-v2.py ../../data/pilotnonling/test/

    The single positional argument is the directory holding the files to
    annotate. Each selected document is passed to ``add_units_annotations``
    or ``add_discourse_annotations`` according to its stage, and the result
    is written to ``<Directory>/<game>/<stage>/<metal>/<game>_<part>.aa``.

    Raises:
        Exception: if a document with any other stage is encountered.
    """
    # Function-scope import so the block does not depend on a file-level
    # ``import os`` that may not exist.
    import os

    def to_annotate(fileId):
        stage = fileId.stage
        return stage == 'units' or stage == 'discourse'

    parser = argparse.ArgumentParser()
    parser.add_argument('Directory',
                        help='directory where the files to annotate are')
    args = parser.parse_args()
    Directory = args.Directory

    reader = STAC.Reader(Directory)
    subset = reader.filter(reader.files(), to_annotate)
    corpus = reader.slurp(subset, verbose=True)

    for key in corpus.keys():
        doc = corpus[key]
        # The string form of the key is split on spaces into
        # game, [part], stage and a fourth field.
        data = str(key).split(' ')
        game = data[0]
        part = data[1][1:-1]  # the integer that interests us is between brackets
        stage = data[2]
        metal = data[3]
        # os.path.join yields the same path as the old string concatenation
        # when Directory ends with a separator, and a correct one when it
        # does not.
        out_path = os.path.join(Directory, game, stage, metal,
                                game + '_' + part + '.aa')
        print(game, part, stage, metal)
        if stage == 'units':
            newdoc = add_units_annotations(doc)
        elif stage == 'discourse':
            newdoc = add_discourse_annotations(doc)
        else:
            raise Exception("main : you shouldn't be here!")
        GLOZZ.write_annotation_file(out_path, newdoc)
示例#6
0
文件: __init__.py 项目: arne-cl/educe
def write_annotation_file(anno_filename, doc):
    """
    Dump a GlozzDocument as XML into the file at ``anno_filename``.

    Forwards to ``glozz.write_annotation_file``, passing
    ``stac_output_settings`` as the ``settings`` keyword.
    """
    glozz.write_annotation_file(anno_filename,
                                doc,
                                settings=stac_output_settings)