Пример #1
0
        # A non-200 response ends the enclosing loop (header not visible here),
        # except for one hard-coded case -- transCode 'intb', bookNo 58,
        # chapterNo 3 -- which is processed regardless of the status.
        if r.status != 200:
            if transCode != 'intb' or bookNo != 58 or chapterNo != 3:
                break

        content = r.read().decode(encoding)
        verses = verse_re.findall(content)

        # Hard-coded special case: discard the last match for 'glu' / '3jo'
        # chapter 1.
        if transCode == 'glu' and bookCodes[bookNo] == '3jo' and chapterNo == 1:
            verses = verses[:-1]

        # Sanity check: field 0 of every match must equal the current chapter
        # and field 1 must count up from 1 without gaps.
        for v, vn in izip(verses, count(1)):
            if int(v[0]) != chapterNo:
                raise Exception('strange chapterno at verse {}'.format(vn))
            if int(v[1]) != vn:
                raise Exception('strange verseno at verse {}'.format(vn))

        # Field 2 of each match is passed to create_xml; the result is written
        # UTF-8 encoded to "<bookCode>_<chapterNo, 3 digits>.html".
        xml = create_xml([v[2] for v in verses])
        with open('{0}_{1:03}.html'.format(bookCodes[bookNo], chapterNo),
                  'w') as f:
            f.write(xml.encode('utf-8'))

        # Store the same markup in the html table.
        c.execute(
            "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)",
            (langCode, bookCodes[bookNo], chapterNo, xml))

        # Progress line: bookCode|chapterNo|number of verses.
        print '{0}|{1}|{2}'.format(bookCodes[bookNo], chapterNo, len(verses))
        sys.stdout.flush()

# Commit the pending INSERTs explicitly before compacting the database.
# SQLite refuses to VACUUM while a transaction is open, so issuing VACUUM first
# only works when the sqlite3 module commits implicitly before it.
# (presumably c is a cursor on db -- its creation is not visible here)
db.commit()
c.execute("vacuum")
Пример #2
0
            # Insert a placeholder tuple for verse 34 at index 33 (the
            # condition guarding this line is outside this view).
            verses[33:33] = [('34', '34', u'——')]
        # Same kind of placeholder for book 'ac': verse 7 of chapter 24 and
        # verse 29 of chapter 28.  Presumably the source lacks these verses;
        # the placeholders keep the numbering check below satisfied.
        if book[0] == 'ac' and chapterNo == 24:
            verses[6:6] = [('7', '7', u'——')]
        if book[0] == 'ac' and chapterNo == 28:
            verses[28:28] = [('29', '29', u'——')]

        # Sanity check: the two number fields of each match must agree with
        # each other and count up from 1 without gaps.
        for v, vn in izip(verses, count(1)):
            if v[0] != v[1]:
                raise Exception('verse no not consistent: {0} != {1}'.format(
                    v[0], v[1]))
            if int(v[0]) != vn:
                raise Exception('invalid verse no: {0} (should be {1})'.format(
                    v[0], vn))

        # Flatten each verse text: '<br>' and newlines become spaces, then a
        # single pass replaces double spaces with one space.
        xml = create_xml(
            v[2].replace('<br>', ' ').replace('\n', ' ').replace('  ', ' ')
            for v in verses)

        # Write the markup UTF-8 encoded to "<bookCode>_<chapterNo>.html".
        with open('{0}_{1:03}.html'.format(book[0], chapterNo), 'w') as f:
            f.write(xml.encode('utf-8'))

        # Store the same markup in the html table under language code 'fi'.
        c.execute(
            "INSERT INTO html (langCode, bookCode, chapterno, html) VALUES (?, ?, ?, ?)",
            ('fi', book[0], chapterNo, xml))

        # Progress line: bookCode|chapterNo|number of verses.
        print '{0}|{1}|{2}'.format(book[0], chapterNo, len(verses))
        sys.stdout.flush()

# Commit the pending INSERTs explicitly before compacting the database.
# SQLite refuses to VACUUM while a transaction is open, so issuing VACUUM first
# only works when the sqlite3 module commits implicitly before it.
# (presumably c is a cursor on db -- its creation is not visible here)
db.commit()
c.execute('vacuum')
Пример #3
0
            # Collect the matches from this piece of content; the first piece
            # with no matches ends the inner loop (header not visible here).
            found = verse_re.findall(content)

            if len(found) > 0:
                raw_verses += found
            else:
                break

        # Nothing was collected at all: leave the enclosing loop.
        if len(raw_verses) == 0:
            break

        # Sanity check: fields 0-3 of every match must equal versao, bookNo,
        # chapterNo and the 1-based position of the match.
        for i, rv in enumerate(raw_verses, 1):
            if int(rv[0]) != versao or \
               int(rv[1]) != bookNo or \
               int(rv[2]) != chapterNo or \
               int(rv[3]) != i:
                raise Exception('Strange verse no: {0}'.format(rv))

        # Field 4 is the verse text: decode it and strip surrounding whitespace.
        verses = [rv[4].decode(encoding).strip() for rv in raw_verses]
        xml = create_xml(verses)

        # Store the chapter markup in the html table.
        c.execute(
            'INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)',
            (langCode, bookCode, chapterNo, xml))

        # Progress line: bookCode|chapterNo|number of verses.
        print '{0}|{1}|{2}'.format(bookCode, chapterNo, len(raw_verses))
        sys.stdout.flush()

# Commit the pending INSERTs explicitly before compacting the database.
# SQLite refuses to VACUUM while a transaction is open, so issuing VACUUM first
# only works when the sqlite3 module commits implicitly before it.
# (presumably c is a cursor on db -- its creation is not visible here)
db.commit()
c.execute('VACUUM')
Пример #4
0
    # Split the book's verses into chapters.  Field 0 of each raw verse is the
    # chapter number, field 1 the verse number, field 2 the text.
    # `chapters` is created outside this view.
    chapterNo = 0
    expectedVerseNo = 1
    for rv in raw_verses:
        cn = int(rv[0])
        if cn != chapterNo:
            # A new chapter must be exactly the next one; start its verse
            # list and restart the expected verse number at 1.
            if cn != chapterNo + 1:
                raise Exception('Unexpected chapterNo: {0} {1}'.format(
                    bookCode, cn))
            chapterNo = cn
            expectedVerseNo = 1
            chapters.append([])

        # Verse numbers must count up from 1 without gaps within a chapter.
        vn = int(rv[1])
        if vn != expectedVerseNo:
            raise Exception('Unexpected verseNo: {0} {1}:{2}'.format(
                bookCode, cn, vn))
        expectedVerseNo += 1

        # Replace '&nbsp;' entities with plain spaces and trim the text.
        chapters[cn - 1].append(rv[2].replace('&nbsp;', ' ').strip())

    # Insert one row per chapter (numbered from 1) under language code 's'
    # and print a "'bookCode', chapter, verse count" line for each.
    for c, cn in izip(chapters, count(1)):
        cursor.execute(
            "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)",
            ('s', bookCode, cn, create_xml(c)))
        print "{0:6}, {1:03}, {2:03}".format("'{0}'".format(bookCode), cn,
                                             len(c))

# Commit the pending INSERTs explicitly before compacting the database.
# SQLite refuses to VACUUM while a transaction is open, so issuing VACUUM first
# only works when the sqlite3 module commits implicitly before it.
# (presumably cursor belongs to db -- its creation is not visible here)
db.commit()
cursor.execute('VACUUM')
Пример #5
0
        chapter += 1

        url = '/bible/{0}/{1:02}/{2:02}/'.format(lang, bookNo, chapter)

        c = httplib.HTTPConnection('bibleonline.ru')
        c.request(
            'GET', url, '', {
                'User-Agent':
                'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.38 Safari/535.11',
                'Referer': 'http://bibleonline.ru/bible/rus/01/01/'
            })
        content = c.getresponse().read()

        verses = verse_re.findall(content)

        if len(verses) == 0: break

        for n, v in itertools.izip(itertools.count(1), verses):
            if int(v[0]) != n:
                raise Exception('Strange verse number: {0} {1}:{2}'.format(
                    bookCode, chapter, n))

        texts = [v[1].decode('utf-8') for v in verses]

        f = open('{0}/{1}_{2:03}.html'.format(lang, bookCode, chapter), 'w')
        f.write(createxml.create_xml(texts).encode('utf-8'))
        f.close()

        print '{0}|{1}|{2}'.format(bookCode, chapter, len(verses))
        sys.stdout.flush()
Пример #6
0
        # Collect the text of every verse element of the chapter.
        verses = []
        for verseNo, verseElement in enumerate(verseElements, 1):
            # The 'vnumber' attribute must match the element's 1-based position.
            if verseNo != int(verseElement.attributes['vnumber'].value):
                raise Exception('Invalid verse no {0} {1}:{2}'.format(
                    bookCode, chapterNo, verseNo))
            # More than one child node is rejected.
            if verseElement.firstChild is not None and verseElement.firstChild.nextSibling is not None:
                raise Exception(
                    'Seems like child tag in VERS {0} {1}:{2}'.format(
                        bookCode, chapterNo, verseNo))

            # An element without children contributes an empty verse.
            if verseElement.firstChild is not None:
                verses.append(verseElement.firstChild.nodeValue)
            else:
                verses.append('')

        xml_text = create_xml(verses)

        # with open('{0}_{1}.html'.format(bookCode, chapterNo), 'w') as f:
        #     f.write(xml_text.encode('utf-8'))

        # Store the chapter markup; the third command-line argument is used as
        # the langCode value.
        c.execute(
            "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)",
            (sys.argv[3], bookCode, chapterNo, xml_text))

        # Progress line: bookCode|chapterNo|number of verses.
        print '{0}|{1}|{2}'.format(bookCode, chapterNo, len(verses))
        sys.stdout.flush()

# Commit the pending INSERTs explicitly before compacting the database.
# SQLite refuses to VACUUM while a transaction is open, so issuing VACUUM first
# only works when the sqlite3 module commits implicitly before it.
# (presumably c is a cursor on db -- its creation is not visible here)
db.commit()
c.execute('vacuum')
Пример #7
0
            # Normalise one verse: newlines become spaces, one trailing '<br>'
            # and then one trailing '</p>' are cut off, the result is trimmed
            # and remaining '<br>' tags are rewritten as self-closing '<br/>'.
            text = text.replace('\n', ' ')
            if text.endswith('<br>'): text = text[:-4]
            if text.endswith('</p>'): text = text[:-4]
            text = text.strip()
            text = text.replace('<br>', '<br/>')

            # After removing everything known_tags_re matches, any '<' left
            # over is treated as an unknown tag.
            withoutTags = re.sub(known_tags_re, '', text)
            if withoutTags.find('<') != -1:
                raise Exception('Unknown tag: {0} {1}:{2}'.format(
                    bookCode, chapterNo, verseNo))

            texts.append(text)

            #print chapterNo, verseNo, text

        # Build the chapter markup and check that minidom can parse it; on
        # failure the markup is printed before the error is re-raised.
        html = create_xml(texts)
        try:
            xml.dom.minidom.parseString(html.encode('utf-8'))
        except Exception as e:
            print html
            raise Exception('Failed to parse {0} {1} {2}'.format(
                bookCode, chapterNo, e))

        c.execute(
            "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)",
            (langCode, bookCode, chapterNo, html))

        # NOTE(review): this reports len(verses) although the list built above
        # is `texts`; `verses` is defined outside this view -- confirm the two
        # always have the same length.
        print '{0}|{1}|{2}'.format(bookCode, chapterNo, len(verses))

# NOTE(review): no explicit commit on the connection is visible in this view
# before the cursor is closed -- confirm the INSERTs are committed.
c.execute('VACUUM')
c.close()
Пример #8
0
payload_re = re.compile(r'<span class="arabictext">([^<]*?)<br>')

http = httplib.HTTPConnection('copticchurch.net')

for book in books:
    c.execute(
        "SELECT count(*) FROM chapterSize WHERE transCode='{0}' AND bookCode=?"
        .format(transCode), (book[0], ))
    chapters = c.fetchall()[0][0]

    for chapterNo in xrange(1, chapters + 1):
        url = "/cgibin/bible/index.php?r={0}+{1}&version=SVD&showVN=1".format(
            engname[book[0]].replace(' ', '+'), chapterNo)

        http.request("GET", url)
        content = http.getresponse().read().decode('cp1256')
        payload = payload_re.search(content).group(1)

        verses = [
            v.strip().split(' ', 1)[1].strip()
            for v in payload.strip().split('\n')
        ]

        f = open('{0}_{1:03}.html'.format(book[0], chapterNo), 'w')
        f.write(create_xml(verses, True).encode('utf-8'))
        f.close()

        print '{0}\t{1}\t{2}'.format(book[0], chapterNo, len(verses))
        sys.stdout.flush()
Пример #9
0
    # parts[1] is the chapter number, parts[2] the verse number and parts[3]
    # the verse text (bookNo is derived outside this view).
    chapterNo = int(parts[1])
    verseNo = int(parts[2])

    # `content` is a nested list: content[book][chapter] is a list of verse
    # texts.  A new book starts with one empty chapter; a new chapter of the
    # same book appends an empty chapter to that book.
    if bookNo != prevBook:
        content.append([[]])
    elif chapterNo != prevChapter:
        content[bookNo - 1].append([])

    content[bookNo - 1][chapterNo - 1].append(parts[3].decode('utf-8'))

    # Consistency checks: books, chapters and verses must arrive in order,
    # numbered from 1 without gaps.
    # NOTE(review): assert statements are skipped when Python runs with -O.
    assert len(content) == bookNo
    assert len(content[bookNo - 1]) == chapterNo
    assert len(content[bookNo - 1][chapterNo - 1]) == verseNo

    prevBook = bookNo
    prevChapter = chapterNo

for bookNo, book in enumerate(content, 0):
    for chapterNo, chapter in enumerate(book, 1):
        xml = create_xml(chapter)

        c.execute(
            'INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)',
            (sys.argv[3], bookCodes[bookNo], chapterNo, xml))

        print '{0}|{1}|{2}'.format(bookCodes[bookNo], chapterNo, len(chapter))
        sys.stdout.flush()

c.execute('VACUUM')
db.commit()
Пример #10
0
        # Request the next chapter of the current book.
        chapter += 1

        http.request('GET', '/main.php?menu=bible&act=1&nc=50&district={0}&chapter={1}'.format(
            bookNo, chapter
        ))

        content = http.getresponse().read()

        verses = verse_re.findall(content)

        # A page without any verse match ends the enclosing loop (header not
        # visible here).
        if len(verses) == 0: break

        # Sanity check: field 0 of the matches must count up from 1.
        for n, v in izip(count(1), verses):
            if int(v[0]) != n: raise Exception('Strange verse number: {0} {1}:{2}'.format(bookCode, chapter, n))


        # Field 1 is the verse text, decoded from windows-1251 bytes.
        xml = create_xml(v[1].decode('windows-1251') for v in verses)

        # Store the chapter markup under language code 'bl'.
        c.execute("INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", (
            'bl', bookCode, chapter, xml
        ))
        # Print the current number of rows in the html table after each insert
        # (looks like leftover debug output).
        c.execute('select count(*) from html')
        print c.fetchone()

        # Progress line: bookCode|chapter|number of verses.
        print '{0}|{1}|{2}'.format(
            bookCode, chapter, len(verses)
        )
        sys.stdout.flush()

# NOTE(review): no explicit commit is visible in this view -- confirm the
# INSERTs are committed.
c.execute('vacuum')
Пример #11
0
        # Print one row: the quoted book code padded to width 6, then cn and v
        # formatted with the '03' spec (the enclosing loops are not visible).
        print '{0:6}, {1:03}, {2:03}'.format("'{0}'".format(bookCodes[bookNo]),
                                             cn, v)

# Locate the marker that precedes the verse text.  Fail loudly when it is
# missing: find() would return -1 and the slice below would silently start at
# a meaningless offset.
textKey = 'teks________'
textpos = content.find(textKey)
if textpos == -1:
    raise Exception('Text marker not found: {0}'.format(textKey))
# Skip the marker plus 4 more characters (what they are is not visible here).
text = content[textpos + len(textKey) + 4:]

# One verse per line.
verses = text.split('\n')

db = sqlite3.Connection(transCode + '.sqlite')
c = db.cursor()

# Recreate the html table from scratch.
c.execute("DROP TABLE IF EXISTS html")
c.execute(
    "CREATE TABLE html (langCode, bookCode, chapterNo, html, PRIMARY KEY (langCode, bookCode, chapterNo))"
)

# Walk the flat verse list; verseCount[bookNo][cn] says how many consecutive
# lines belong to each chapter.  textpos is reused here as the running index
# into `verses`.
textpos = 0
for bookNo in xrange(66):
    for cn in range(chapterCount[bookNo]):
        xml = create_xml(
            v.decode('utf-8')
            for v in verses[textpos:textpos + verseCount[bookNo][cn]])
        textpos += verseCount[bookNo][cn]
        # cn is 0-based, stored chapter numbers are 1-based.
        c.execute(
            "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)",
            (langCode, bookCodes[bookNo], cn + 1, xml))

# Commit the pending INSERTs explicitly before compacting the database.
# SQLite refuses to VACUUM while a transaction is open, so issuing VACUUM first
# only works when the sqlite3 module commits implicitly before it.
db.commit()
c.execute('vacuum')
Пример #12
0
    "SELECT bookCode, bookName FROM books WHERE transCode='kjv' ORDER BY bookNo"
)
books = c.fetchall()

verse_re = re.compile(
    r"<a href=\"http://www.kingjamesbibleonline.org/[^\"]*?-(\d+?)/\" title='View more translations[^']*?'>(.*?)</a></p>"
)

for book in books:
    c.execute(
        "SELECT count(*) FROM chapterSize WHERE transCode='kjv' AND bookCode=?",
        (book[0], ))
    chapters = c.fetchall()[0][0]

    for chapterNo in xrange(1, chapters + 1):
        url = '/{0}-Chapter-{1}/'.format(book[1].replace(' ', '-'), chapterNo)

        http = httplib.HTTPConnection('www.kingjamesbibleonline.org')
        http.request("GET", url)
        response = http.getresponse()
        content = response.read()

        verses = [v[1] for v in verse_re.findall(content)]

        f = open('{0}_{1:03}.html'.format(book[0], chapterNo), 'w')
        f.write(create_xml(verses))
        f.close()

        print '{0}\t{1}\t{2}'.format(book[0], chapterNo, len(verses))
        sys.stdout.flush()