def extract(filename, text):
    """Collect bibliographic metadata from a document's ``teiHeader`` element.

    For every output field, the entries stored under the matching key of the
    module-level ``bib_metadata`` mapping are passed to ``header.findtext``
    in order; the first one that yields non-empty text wins. Fields without
    a match are simply left out of the result.

    Args:
        filename: Stored verbatim under the ``"filename"`` key.
        text: Document source, handed unchanged to ``st.parse``.

    Returns:
        A dict that always contains ``"filename"`` and, when found, any of
        ``"author"``, ``"auth_date"``, ``"gender"``, ``"title"``, ``"date"``
        (converted with ``int``), ``"genre"``, ``"publisher"`` and
        ``"pub_place"``.

    Raises:
        ValueError: If the matched creation-date text is not an integer
            literal accepted by ``int``.
    """
    # (output key, bib_metadata key, converter or None), kept in the same
    # order the fields have always been looked up in.
    fields = (
        ("author", "auth_name", None),
        ("auth_date", "auth_dates", None),
        ("gender", "auth_gender", None),
        ("title", "titles", None),
        ("date", "createdate", int),
        ("genre", "text_genre", None),
        ("publisher", "publishers", None),
        ("pub_place", "pub_places", None),
    )

    biblio = {"filename": filename}
    header = st.parse(text).find("teiHeader")
    if header is None:
        # find() returns None when the element is absent; treat that like a
        # header with no usable fields instead of failing on None.findtext.
        return biblio

    for out_key, paths_key, convert in fields:
        for path in bib_metadata[paths_key]:
            # Evaluate each path once and reuse the result.
            value = header.findtext(path)
            if value:
                biblio[out_key] = convert(value) if convert else value
                break
    return biblio
def extract(filename, text):
    """Collect bibliographic metadata from a document's ``teiHeader`` element.

    For every output field, the entries stored under the matching key of the
    module-level ``bib_metadata`` mapping are passed to ``header.findtext``
    in order; the first one that yields non-empty text wins. Fields without
    a match are simply left out of the result.

    Args:
        filename: Stored verbatim under the ``"filename"`` key.
        text: Document source, handed unchanged to ``st.parse``.

    Returns:
        A dict that always contains ``"filename"`` and, when found, any of
        ``"author"``, ``"auth_date"``, ``"gender"``, ``"title"``, ``"date"``
        (converted with ``int``), ``"genre"``, ``"publisher"`` and
        ``"pub_place"``.

    Raises:
        ValueError: If the matched creation-date text is not an integer
            literal accepted by ``int``.
    """
    # (output key, bib_metadata key, converter or None), kept in the same
    # order the fields have always been looked up in.
    fields = (
        ("author", "auth_name", None),
        ("auth_date", "auth_dates", None),
        ("gender", "auth_gender", None),
        ("title", "titles", None),
        ("date", "createdate", int),
        ("genre", "text_genre", None),
        ("publisher", "publishers", None),
        ("pub_place", "pub_places", None),
    )

    biblio = {"filename": filename}
    header = st.parse(text).find("teiHeader")
    if header is None:
        # find() returns None when the element is absent; treat that like a
        # header with no usable fields instead of failing on None.findtext.
        return biblio

    for out_key, paths_key, convert in fields:
        for path in bib_metadata[paths_key]:
            # Evaluate each path once and reuse the result.
            value = header.findtext(path)
            if value:
                biblio[out_key] = convert(value) if convert else value
                break
    return biblio
#!/usr/bin/env python
# For every path given on the command line: parse the file, then print the
# serialized teiHeader element followed by the text found at
# ".//titleStmt/title" and ".//titleStmt/author" inside that header.
import codecs
import sys

import philologic.shlaxtree as st

for filename in sys.argv[1:]:
    # Read as UTF-8. The with-block closes the handle even if parsing or
    # printing fails; all use of the parsed tree stays inside it so nothing
    # depends on the parser having consumed the file eagerly.
    with codecs.open(filename, "r", "utf-8") as source:
        root = st.parse(source)
        header = root.find("teiHeader")
        if header is None:
            # Report and move on so one header-less file does not abort the
            # remaining command-line arguments.
            sys.stderr.write("%s: no teiHeader element found\n" % filename)
            continue
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement.
        print(st.et.tostring(header))
        print(header.findtext(".//titleStmt/title"))
        print(header.findtext(".//titleStmt/author"))
#!/usr/bin/env python
# For every path given on the command line: parse the file, then print the
# serialized teiHeader element followed by the text found at
# ".//titleStmt/title" and ".//titleStmt/author" inside that header.
import codecs
import sys

import philologic.shlaxtree as st

for filename in sys.argv[1:]:
    # Read as UTF-8. The with-block closes the handle even if parsing or
    # printing fails; all use of the parsed tree stays inside it so nothing
    # depends on the parser having consumed the file eagerly.
    with codecs.open(filename, "r", "utf-8") as source:
        root = st.parse(source)
        header = root.find("teiHeader")
        if header is None:
            # Report and move on so one header-less file does not abort the
            # remaining command-line arguments.
            sys.stderr.write("%s: no teiHeader element found\n" % filename)
            continue
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement.
        print(st.et.tostring(header))
        print(header.findtext(".//titleStmt/title"))
        print(header.findtext(".//titleStmt/author"))