Example 1
def get_book_from_wiki():
    # Collect the titles of all books whose DTA info page mentions Wikisource.
    book_no = 0
    wiki_no = 0
    with open(pjoin(data_folder, 'list_wiki'), 'w', encoding='utf-8') as f_out:
        for century, book_title, text in books():
            book_no += 1
            print(book_no)
            # Fetch the book's info page and check for a Wikisource reference.
            cur_book_info_url = book_info_url % book_title
            content = requests.get(cur_book_info_url).text
            if 'Wikisource' in content:
                wiki_no += 1
                print(wiki_no, book_title)
                f_out.write(book_title + '\n')
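All six snippets share module-level names that are not shown on this page (pjoin, data_folder, books(), and the '%s'-style URL templates). A minimal sketch of that assumed context follows; the concrete values and the body of books() are stand-ins inferred from usage, not the original definitions.

import os
import subprocess
from os.path import join as pjoin
from shutil import move

import requests
from bs4 import BeautifulSoup

data_folder = 'data'  # assumed local working directory

# Placeholder '%s' templates; the real DTA URL patterns are not shown in the snippets.
book_info_url = '...%s'
book_page_url = '...%s'
xml_url = '...%s'
image_url = '...%s/%s/%s'


def books():
    # Assumed generator yielding (century, book_title, text) triples;
    # the real implementation is not part of these examples.
    yield from []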
Example 2
def get_font_info():
    # For every book, record the typeface ('Schriftart') listed on its DTA info page.
    book_no = 0
    with open(pjoin(data_folder, 'list_font'), 'w', encoding='utf-8') as f_out:
        for century, book_title, text in books():
            book_no += 1
            print(book_no)
            cur_book_info_url = book_info_url % book_title
            content = requests.get(cur_book_info_url).text
            cur_html = BeautifulSoup(content, 'html')
            all_tds = cur_html.body.find_all('td')
            num_tds = len(all_tds)
            # The value sits in the table cell that follows the 'Schriftart:' label cell.
            for i in range(num_tds):
                if all_tds[i].text == 'Schriftart:':
                    f_out.write(all_tds[i + 1].text + '\n')
                    break
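As a design note, the positional all_tds[i + 1] lookup can also be written with BeautifulSoup's document-order navigation. A minimal sketch, reusing cur_html and f_out from the loop above and assuming the label cell is immediately followed by the value cell:

label_cell = cur_html.body.find('td', string='Schriftart:')
if label_cell is not None:
    # The next <td> in document order holds the typeface value.
    f_out.write(label_cell.find_next('td').text + '\n')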
Example 3
def download_xml():
    # Fetch each book's TEI XML, either from the web or from the local DTA dump.
    with open(pjoin(data_folder, 'list_urls'), encoding='utf-8') as f_:
        # (f_ is opened here but never read below.)
        for century, book_title, text in books():
            cur_xml_name = book_title + '.TEI-P5.ling.xml'
            if not os.path.exists(
                    pjoin(data_folder, 'dta-lingattr-tei_2019-02-06',
                          cur_xml_name)):
                # Not in the local dump: download the TEI XML with wget.
                print(book_title)
                cur_xml_url = xml_url % book_title
                subprocess.run('wget -O %s \'%s\'' %
                               (pjoin(data_folder, 'all_xml', century,
                                      cur_xml_name), cur_xml_url),
                               shell=True)
            else:
                # Already present locally: move it into the per-century folder.
                move(
                    pjoin(data_folder, 'dta-lingattr-tei_2019-02-06',
                          cur_xml_name),
                    pjoin(data_folder, 'all_xml', century, cur_xml_name))
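Shelling out to wget is one option; the same download could also be done in-process. A minimal sketch with requests, reusing cur_xml_url and the target path from the branch above (no error handling or retries beyond the status check):

response = requests.get(cur_xml_url)
response.raise_for_status()  # fail loudly on HTTP errors
with open(pjoin(data_folder, 'all_xml', century, cur_xml_name), 'wb') as xml_out:
    xml_out.write(response.content)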
Example 4
def get_annotations():
    # Split the downloaded TEI files into those that link to Wikisource and the rest.
    num_wiki = 0
    num_annot = 0
    with open(pjoin(data_folder, 'list_wiki_TEI'), 'w', encoding='utf-8') as f_out:
        for century, book_title, text in books():
            book_path = pjoin(data_folder, 'all_xml', century, '%s.TEI-P5.ling.xml' % book_title)
            if not os.path.exists(book_path):
                continue
            with open(book_path, encoding='utf-8') as f_:
                content = f_.read()
                if 'a href="http://de.wikisource.org/wiki/Wikisource' in content:
                    # The TEI header links to Wikisource: record century and title.
                    num_wiki += 1
                    print(book_path)
                    f_out.write(century + '\t' + book_title + '\n')
                else:
                    num_annot += 1

        print(num_wiki, num_annot)
Example 5
def get_cmds():
    # Write one wget command per scanned page image into 'download_cmd'.
    i = 0
    with open(pjoin(data_folder, 'download_cmd'), 'w',
              encoding='utf-8') as f_out:
        for century, book_title, text in books():
            print(i)
            i += 1
            book_folder = pjoin(data_folder, 'all_image', century, book_title)
            cur_book_page_url = book_page_url % book_title
            content = requests.get(cur_book_page_url).text
            cur_html = BeautifulSoup(content, 'html')
            # Take the first four characters of each <option> entry as the page identifier.
            cur_pages = [
                ele.text[:4] for ele in cur_html.body.find_all('option')
            ]
            for page in cur_pages:
                cur_image_url = image_url % (book_title, book_title, page)
                download_cmd = 'wget -O %s \'%s\'' % (pjoin(
                    book_folder, page + '.png'), cur_image_url)
                f_out.write(download_cmd + '\n')
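The function only writes the commands; they still have to be executed, for example with a shell, or from Python as in this minimal sketch that reads back the file produced by get_cmds():

# Execute the wget commands written by get_cmds(), one per line.
with open(pjoin(data_folder, 'download_cmd'), encoding='utf-8') as f_cmd:
    for cmd in f_cmd:
        subprocess.run(cmd.strip(), shell=True)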
Example 6
def make_book_folder():
    # Create one image folder per book; here books() yields the DTA book URL,
    # so the title is recovered by stripping the common URL prefix.
    for century, url, text in books():
        book_title = url[len('http://www.deutschestextarchiv.de/book/show/'):]
        book_folder = pjoin(data_folder, 'all_image', century, book_title)
        if not os.path.exists(book_folder):
            os.makedirs(book_folder)