/
div1_div2.py
145 lines (117 loc) · 4.04 KB
/
div1_div2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
""" Div1 / Div2 old Perseus data to Div/Div + RefsDecl converter
Authors : Aaron Plasek, Ariane Pinche, Mark Moll, Ana Migowski
Adaptation : Thibault Clérice
Python 3 Script
Description :
This software will transform old Perseus files into CTS compliant files if their structure is div1/div2 based
Example of file needing this :
https://raw.githubusercontent.com/PerseusDL/canonical-greekLit/598aea1eb719be1709f720839e4428a087e43ad6/data/tlg0612/tlg001/tlg0612.tlg001.perseus-grc1.xml
Example of output :
Syntax :
python3 div1_div2.py [Url of the original file on raw.github]
Requires :
- requests
- lxml
- MyCapytain
pip install ...
"""
# Import command line arguments
from sys import argv
# Import required library
from lxml import etree
import requests
# Import library for CTS
import MyCapytain.resources.texts.tei
import common
def _retag_numbered_divs(parsed, tag):
    """ Turn every legacy numbered div (div1, div2, div3) of a document into a CTS text part

    Each matching element is renamed to "div", its former @type is moved to @subtype,
    its @type becomes "textpart" and a missing @n is filled in.

    :param parsed: Parsed XML document as returned by common.parse (must expose .xpath ; presumably an lxml tree)
    :param tag: Name of the legacy element to convert, e.g. "div1"
    :type tag: str
    :returns: The converted elements and the last @type value found on them (None if none carried a @type)
    :rtype: tuple
    """
    elements = parsed.xpath("//" + tag)
    last_subtype = None
    # The counter runs over the whole document : it is not reset for each parent element
    for position, element in enumerate(elements, start=1):
        element.tag = "div"
        # lxml rejects None as an attribute value, so an element without @type
        # simply gets no @subtype instead of aborting the whole conversion
        former_type = element.get("type")
        if former_type is not None:
            element.set("subtype", former_type)
            last_subtype = former_type
        element.set("type", "textpart")
        # Elements without @n receive their position in document order
        if element.get("n") is None:
            element.set("n", str(position))
    return elements, last_subtype


def _refs_decl(type_text, depth):
    """ Build the refsDecl XPath pattern matching a passage at the given citation depth

    :param type_text: Value of @type on the top level div ("edition" or "translation")
    :type type_text: str
    :param depth: Number of nested text part levels (1 gives $1, 2 gives $1/$2, ...)
    :type depth: int
    :returns: XPath pattern with one $n placeholder per level
    :rtype: str
    """
    root = "/tei:TEI/tei:text/tei:body/tei:div[@type='" + type_text + "']"
    levels = ["/tei:div[@n='$" + str(level) + "']" for level in range(1, depth + 1)]
    return root + "".join(levels)


def transform(url):
    """ Download an xml file, convert its div1/div2/div3 into CTS text parts and write the result

    The language, urn and output target are all obtained from common.parse(url).

    :param url: A Perseus Github Raw address
    :type url: str
    :raises ValueError: If the document contains no div1 element
    """
    lang, urn, target, parsed = common.parse(url)

    # A urn mentioning neither "grc" nor "lat" is treated as a translation
    if "grc" in urn or "lat" in urn:
        type_text = "edition"
    else:
        type_text = "translation"

    # Same order as before : all div1, then all div2, then all div3
    div1_group, div1_subtype = _retag_numbered_divs(parsed, "div1")
    div2_group, div2_subtype = _retag_numbered_divs(parsed, "div2")
    div3_group, div3_subtype = _retag_numbered_divs(parsed, "div3")

    # Without any div1 there is no first citation level to declare ; fail with an
    # explicit message rather than a NameError on an unbound variable
    if not div1_group:
        raise ValueError(urn + " contains no div1 element")

    # Add refsDecl information for CTS, deepest level first.
    # The div2 and div3 levels are declared only when such elements exist,
    # the div1 level is always declared.
    citations = []
    if div3_group:
        citations.append(
            MyCapytain.resources.texts.tei.Citation(
                name=div3_subtype,
                refsDecl=_refs_decl(type_text, 3)
            )
        )
    if div2_group:
        citations.append(
            MyCapytain.resources.texts.tei.Citation(
                name=div2_subtype,
                refsDecl=_refs_decl(type_text, 2)
            )
        )
    citations.append(
        MyCapytain.resources.texts.tei.Citation(
            name=div1_subtype,
            refsDecl=_refs_decl(type_text, 1)
        )
    )

    # Writing is best effort : a failure is reported on stdout and not re-raised
    try:
        common.write_and_clean(urn, lang, parsed, citations, target)
    except Exception as error:
        print(urn + " failed")
        print(error)
# Script entry point : every command line argument is forwarded to transform
if __name__ == "__main__":
    transform(*argv[1:])