def get_book_from_wiki():
    """Record, in 'list_wiki', every book whose DTA info page mentions Wikisource.

    Prints a running book counter and, for each hit, the hit counter plus title.
    """
    hits = 0
    with open(pjoin(data_folder, 'list_wiki'), 'w', encoding='utf-8') as f_out:
        for seen, (century, book_title, text) in enumerate(books(), start=1):
            print(seen)
            page_text = requests.get(book_info_url % book_title).text
            # A plain substring test is enough: any mention marks the book.
            if 'Wikisource' in page_text:
                hits += 1
                print(hits, book_title)
                f_out.write(book_title + '\n')
def get_font_info():
    """Scrape each book's font ('Schriftart:') from its DTA info page into 'list_font'.

    The info page lays metadata out as label/value <td> pairs; the cell after
    the 'Schriftart:' label holds the font name.
    """
    book_no = 0
    with open(pjoin(data_folder, 'list_font'), 'w', encoding='utf-8') as f_out:
        for century, book_title, text in books():
            book_no += 1
            print(book_no)
            content = requests.get(book_info_url % book_title).text
            cur_html = BeautifulSoup(content, 'html')
            all_tds = cur_html.body.find_all('td')
            # Pair each label cell with the cell that follows it. This fixes the
            # original's unguarded all_tds[i+1], which raised IndexError whenever
            # 'Schriftart:' happened to be the last <td> on the page.
            for label_td, value_td in zip(all_tds, all_tds[1:]):
                if label_td.text == 'Schriftart:':
                    f_out.write(value_td.text + '\n')
                    break
def download_xml():
    """Place each book's TEI XML under all_xml/<century>/, downloading if needed.

    If the file already exists in the 'dta-lingattr-tei_2019-02-06' dump it is
    moved into place; otherwise it is fetched with wget.
    """
    # NOTE: the original opened pjoin(data_folder, 'list_urls') here and never
    # read from it — a dead file handle, removed.
    for century, book_title, text in books():
        cur_xml_name = book_title + '.TEI-P5.ling.xml'
        src_path = pjoin(data_folder, 'dta-lingattr-tei_2019-02-06', cur_xml_name)
        dst_path = pjoin(data_folder, 'all_xml', century, cur_xml_name)
        if not os.path.exists(src_path):
            print(book_title)
            # List-form argv with shell=False: no shell quoting/injection issues
            # from titles containing spaces or metacharacters (the original built
            # a shell string with shell=True).
            subprocess.run(['wget', '-O', dst_path, xml_url % book_title])
        else:
            move(src_path, dst_path)
def get_annotations():
    """Split downloaded TEI files into Wikisource-linked vs. other annotation sources.

    Writes '<century>\t<title>' lines to 'list_wiki_TEI' for Wikisource books and
    prints the final tallies.
    """
    wiki_marker = 'a href="http://de.wikisource.org/wiki/Wikisource'
    num_wiki, num_annot = 0, 0
    with open(pjoin(data_folder, 'list_wiki_TEI'), 'w', encoding='utf-8') as f_out:
        for century, book_title, text in books():
            book_path = pjoin(data_folder, 'all_xml', century,
                              '%s.TEI-P5.ling.xml' % book_title)
            # Skip books whose XML was never downloaded.
            if not os.path.exists(book_path):
                continue
            with open(book_path, encoding='utf-8') as f_:
                content = f_.read()
            if wiki_marker not in content:
                num_annot += 1
                continue
            num_wiki += 1
            print(book_path)
            f_out.write(century + '\t' + book_title + '\n')
    # Summary of both counts (placement inferred from the mangled source —
    # reads as a post-loop summary rather than a per-book print).
    print(num_wiki, num_annot)
def get_cmds():
    """Emit one wget command per scanned page image into 'download_cmd'.

    The page list comes from the <option> elements of the book's page-selector;
    the first four characters of each option are the page number.
    """
    with open(pjoin(data_folder, 'download_cmd'), 'w', encoding='utf-8') as f_out:
        for idx, (century, book_title, text) in enumerate(books()):
            print(idx)
            book_folder = pjoin(data_folder, 'all_image', century, book_title)
            content = requests.get(book_page_url % book_title).text
            cur_html = BeautifulSoup(content, 'html')
            for option in cur_html.body.find_all('option'):
                page = option.text[:4]
                cur_image_url = image_url % (book_title, book_title, page)
                target = pjoin(book_folder, page + '.png')
                f_out.write('wget -O %s \'%s\'' % (target, cur_image_url) + '\n')
def make_book_folder():
    """Ensure an image folder exists for every book: all_image/<century>/<title>.

    NOTE(review): here books() is unpacked as (century, url, text) and the title
    is sliced off a deutschestextarchiv.de URL, unlike the other functions which
    unpack the second field directly as a title — confirm what books() yields.
    """
    prefix = 'http://www.deutschestextarchiv.de/book/show/'
    for century, url, text in books():
        book_title = url[len(prefix):]
        book_folder = pjoin(data_folder, 'all_image', century, book_title)
        # exist_ok=True replaces the original check-then-create pair, which was
        # racy (folder could appear between os.path.exists and os.makedirs).
        os.makedirs(book_folder, exist_ok=True)