Пример #1
0
def createJsonRepresentation(app):
    """Build a JSON-compatible description of the readings under ``app``.

    Every element child of ``app`` is treated as one reading (witness).
    For each reading, all descendant ``<w>`` elements that have at least
    one direct text-node child become a token; ``<w>`` elements without
    direct text are skipped.

    Args:
        app: A DOM element (minidom-style API: ``childNodes``,
            ``getAttribute``, ``getElementsByTagName``, ``toxml``) whose
            element children are the readings.

    Returns:
        A dict of the form
        ``{'witnesses': [{'id': <wit attribute>, 'tokens': [token, ...]}, ...]}``
        where each token is ``{'t': ..., 'n': ..., 'u': ...}``:

        * ``'t'`` -- the serialized content of the ``<w>`` element
          (text plus any nested markup), without the ``<w>`` tags;
        * ``'n'`` -- whatever ``Preprocessing.conflate(w)`` returns
          (presumably a normalized form of the word -- confirm against
          that helper);
        * ``'u'`` -- whatever ``getUnit(app)`` returns, shared by every
          token of this ``app``.

        The structure is passed through a JSON dump/load round trip
        before being returned, so it contains only JSON-native types.
    """
    unit = getUnit(app)
    witnesses = []

    # nodeType 1 is ELEMENT_NODE: skip whitespace text and comments that
    # sit between the reading elements.
    readings = [el for el in app.childNodes if el.nodeType == 1]
    for rdg in readings:
        wit_id = rdg.getAttribute('wit')
        tokens = []
        for w in rdg.getElementsByTagName('w'):
            # nodeType 3 is TEXT_NODE: only words with direct text count.
            if not any(child.nodeType == 3 for child in w.childNodes):
                continue
            tokens.append({
                # Serialize the children rather than slicing a fixed number
                # of characters off w.toxml(): the slice is only correct
                # when the start tag is exactly <w n="..."> with an
                # unescaped value, and silently corrupts the text when
                # 'n' is absent, escaped, or other attributes are present.
                't': ''.join(child.toxml() for child in w.childNodes),
                'n': Preprocessing.conflate(w),
                'u': unit,
            })
        witnesses.append({'id': wit_id, 'tokens': tokens})

    # Round trip through JSON so the caller gets plain JSON-native values
    # (and a non-serializable value fails here rather than downstream).
    return json.loads(json.dumps({'witnesses': witnesses}))
Пример #2
0
    docLevel['id'] = afile
    tokenList = []
    if debug:
        html.write('<h2>' + afile + '</h2><table border = "1"><th>Original<th>Conflated</th>')
    ws = minidom.parse(os.path.join(path, afile)).getElementsByTagName('w')
    words = []
    for w in range(len(ws)):
        if not 3 in [child.nodeType for child in ws[w].childNodes]: #checking presence of text nodes inside the w
            continue
        currentWord = ws[w]
        previousWord = ''
        try:
            previousWord = ws[w-1]
        except IndexError:
            pass
        token = {}
        token['t'] = currentWord.toxml()[8+len(currentWord.getAttribute('n')):-4]
        c = Preprocessing.conflate(currentWord)
        if c == Preprocessing.conflate(previousWord):
            c += '1' # tag '1' to the end of a wod that we suspect is repeated in the manuscript.
        token['n'] = c
        token['u'] = unit
        words.append(c)
        tokenList.append(token)
    docLevel['tokens'] = tokenList
    alldocs.append(docLevel)
# Attach the per-document entries accumulated in alldocs under the
# top-level 'witnesses' key.
root['witnesses'] = alldocs
# Serialize without ASCII-escaping and write the result as UTF-8 bytes.
# NOTE(review): writing encoded bytes to a handle opened with mode 'w', and
# the print statement below, only work on Python 2.
with open(os.path.join(path, jsonFileName), 'w') as Json:
    Json.write(json.dumps(root, ensure_ascii=False).encode('utf-8'))
# Report the time elapsed since startTimeX2J (presumably recorded at the
# start of the script -- confirm where it is set).
print 'Took', datetime.datetime.now()-startTimeX2J, 'to execute XMLsToJSON.py'
Пример #3
0
    root = {}
    alldocs = []
    rdgs = [el for el in minidom.parse(os.path.join(path, afile)).getElementsByTagName('*') if el.localName in ['lem', 'rdg']]
    for rdg in rdgs:
        docLevel = {}
        docLevel['id'] = rdg.getAttribute('wit')
        tokenList = []
        ws = rdg.getElementsByTagName('w')
        words = []
        for w in range(len(ws)):
            if not 3 in [child.nodeType for child in ws[w].childNodes]: #checking presence of text nodes inside the w
                continue
            currentWord = ws[w]
            previousWord = ''
            try:
                previousWord = ws[w-1]
            except IndexError:
                pass
            token = {}
            token['t'] = currentWord.toxml()[8 + len(ws[w].getAttribute('n')):-4]
            c = Preprocessing.conflate(currentWord)
            token['n'] = c
            token['u'] = unit
            words.append(c)
            tokenList.append(token)
        docLevel['tokens'] = tokenList
        alldocs.append(docLevel)
    root['witnesses'] = alldocs
    with open(os.path.join(path, afile[:-3] + 'json'), 'w') as Json:
        Json.write(json.dumps(root, ensure_ascii=False).encode('utf-8'))