Exemplo n.º 1
0
def epub2csv(filename):
    """Write one word-frequency CSV per spine chapter of an EPUB file.

    ``META-INF/container.xml`` is read to locate the package document; that
    document's manifest and spine give the chapter files in reading order.
    The text of each chapter's ``<body>`` is passed to
    ``wordList.makeFreqFromText`` together with the words collected from the
    earlier chapters, and the result is handed to
    ``wordList.createChapterFile``.

    Args:
        filename: Path to the EPUB; a relative path is made absolute first.

    Returns:
        A list of ``(csvPath, language, title, chapterLabel)`` tuples, one per
        processed chapter.  ``title`` falls back to the EPUB's base name and
        ``language`` to ``"BG"`` when the package metadata does not provide
        them.  ``chapterLabel`` is the section number followed by the NCX
        label of the chapter, or by the chapter's file name when the NCX has
        no entry for it.
    """
    # ZIP member names always use "/" regardless of platform, so they must
    # not be built with os.path.join (which yields "\\" on Windows).
    import posixpath

    if not os.path.isabs(filename):
        filename = os.path.abspath(filename)

    allWords = []
    fileList = []

    # The context manager closes the archive even if parsing fails midway.
    with zipfile.ZipFile(filename) as epub:
        metaDom = xml.dom.minidom.parseString(epub.read("META-INF/container.xml"))
        opsFile = metaDom.getElementsByTagName("rootfile")[0].getAttributeNode("full-path").value
        opsDom = xml.dom.minidom.parseString(epub.read(opsFile))
        opsDir = posixpath.dirname(opsFile)

        def metadataText(tag, default):
            # A missing <metadata>, a missing element or an empty element all
            # yield the default instead of raising.
            metadata = opsDom.getElementsByTagName("metadata")
            nodes = metadata[0].getElementsByTagName(tag) if metadata else []
            if nodes and nodes[0].childNodes:
                return getattr(nodes[0].childNodes[0], "data", default)
            return default

        title = metadataText("dc:title", os.path.basename(filename))
        language = metadataText("dc:language", "BG")

        manifestItems = opsDom.getElementsByTagName("manifest")[0].getElementsByTagName("item")

        # id -> href, built once instead of rescanning the manifest for every
        # spine entry.  A later duplicate id overrides an earlier one.
        hrefById = {item.getAttribute("id"): item.getAttribute("href") for item in manifestItems}

        # The NCX table of contents is looked up by the manifest id "ncx".
        ncxFile = next(
            (item.getAttribute("href") for item in manifestItems if item.getAttribute("id") == "ncx"),
            None,
        )

        # content src -> navLabel text, built once.  A later navPoint with the
        # same src overrides an earlier one.
        labelBySrc = {}
        if ncxFile:
            ncxDom = xml.dom.minidom.parseString(epub.read(posixpath.join(opsDir, ncxFile)))
            for navMap in ncxDom.getElementsByTagName("navMap")[:1]:
                for navPoint in navMap.getElementsByTagName("navPoint"):
                    contents = navPoint.getElementsByTagName("content")
                    labels = navPoint.getElementsByTagName("navLabel")
                    if not contents or not labels:
                        continue
                    texts = labels[0].getElementsByTagName("text")
                    if not texts or not texts[0].childNodes:
                        continue
                    labelBySrc[contents[0].getAttribute("src")] = texts[0].childNodes[0].data

        spineItems = opsDom.getElementsByTagName("spine")[0].getElementsByTagName("itemref")
        for section, chapter in enumerate(spineItems, 1):
            chapterFilename = hrefById.get(chapter.getAttribute("idref"))
            if not chapterFilename:
                # The spine refers to an id the manifest does not declare.
                # Skip it (its section number stays consumed) rather than
                # re-processing the previous chapter's file.
                continue

            # Without an NCX entry, label the chapter with its file name
            # instead of reusing the previous chapter's label.
            chapterName = labelBySrc.get(chapterFilename, chapterFilename)

            chapterText = epub.read(posixpath.join(opsDir, chapterFilename))
            soup = BeautifulSoup.BeautifulSoup(chapterText)
            body_text = "".join(soup.body(text=True))

            freqency = wordList.makeFreqFromText(body_text, allWords)
            # Accumulate the distinct words seen so far for the next chapter.
            allWords = list(set(chain(allWords, freqency.keys())))

            sectionPrefix = "{:02d} - ".format(section)
            csvPath = filename + ".cards/{:02d} - ".format(section) + chapterFilename + ".csv"
            wordList.createChapterFile(csvPath, freqency)
            fileList.append((csvPath, language, title, sectionPrefix + chapterName))

    return fileList
Exemplo n.º 2
0
def txt2csv(filename, chapRegEx):
    """Split a UTF-8 text file into chapters and write one CSV per chapter.

    The text is split on ``chapRegEx``.  Each chapter's text is passed to
    ``wordList.makeFreqFromText`` together with the words collected from the
    earlier chapters, and the result is handed to
    ``wordList.createChapterFile``.

    Args:
        filename: Path to the text file; a relative path is made absolute
            first.
        chapRegEx: Regular expression marking chapter boundaries.  The code
            pairs every odd element of the split result with the element that
            follows it and converts the former with ``int()``, so the pattern
            is expected to contain one capturing group that matches the
            chapter number.

    Returns:
        A list of ``(csvPath, "BG", baseName, chapterNumber)`` tuples, one per
        chapter, where ``chapterNumber`` is the zero-padded two-digit number.
    """
    if not os.path.isabs(filename):
        filename = os.path.abspath(filename)

    # Close the file as soon as its content has been read.
    with codecs.open(filename, "r", "utf-8") as f:
        text = f.read()

    chapBoundry = re.compile(chapRegEx, re.UNICODE)

    # Split once.  With one capturing group the result has the layout
    # [preamble, number1, body1, number2, body2, ...]; the preamble is ignored.
    pieces = chapBoundry.split(text)

    allWords = [""]
    fileList = []
    baseName = os.path.basename(filename)

    for chapterId, chapterBody in zip(pieces[1::2], pieces[2::2]):
        freqency = wordList.makeFreqFromText(chapterBody, allWords)
        # TODO: Fix capitals for names
        # Accumulate the distinct words seen so far for the next chapter.
        allWords = list(set(chain(allWords, freqency.keys())))

        chapterNumber = "{:02d}".format(int(chapterId))
        csvPath = filename + chapterNumber + ".csv"
        wordList.createChapterFile(csvPath, freqency)
        fileList.append((csvPath, "BG", baseName, chapterNumber))
    return fileList
Exemplo n.º 3
0
def txt2csv(filename, chapRegEx):
    """Write a word-frequency CSV for every chapter of a UTF-8 text file.

    Chapters are found by splitting the file's text on ``chapRegEx``.  For
    each chapter, ``wordList.makeFreqFromText`` receives the chapter text and
    the words gathered from the preceding chapters; its result is written
    through ``wordList.createChapterFile``.

    Args:
        filename: Path to the text file; made absolute when relative.
        chapRegEx: Chapter-boundary regular expression.  Odd elements of the
            split result are converted with ``int()`` and paired with the
            element after them, so the pattern is expected to have one
            capturing group matching the chapter number.

    Returns:
        A list with one ``(csvPath, "BG", baseName, chapterNumber)`` tuple per
        chapter; ``chapterNumber`` is zero-padded to two digits.
    """
    if not os.path.isabs(filename):
        filename = os.path.abspath(filename)

    # The with-block guarantees the file handle is released after reading.
    with codecs.open(filename, 'r', 'utf-8') as f:
        text = f.read()

    chapBoundry = re.compile(chapRegEx, re.UNICODE)

    # Run the split a single time; with one capturing group it produces
    # [preamble, number1, body1, number2, body2, ...].
    parts = chapBoundry.split(text)
    numbers = parts[1::2]
    bodies = parts[2::2]

    allWords = ['', ]
    fileList = []
    sourceName = os.path.basename(filename)

    for rawNumber, body in zip(numbers, bodies):
        freqency = wordList.makeFreqFromText(body, allWords)
        # TODO: Fix capitals for names
        # Carry the distinct words seen so far into the next chapter.
        allWords = list(set(chain(allWords, freqency.keys())))

        label = "{:02d}".format(int(rawNumber))
        target = filename + label + ".csv"
        wordList.createChapterFile(target, freqency)
        fileList.append((target, "BG", sourceName, label))
    return fileList
Exemplo n.º 4
0
def epub2csv(filename):
    """Write one word-frequency CSV per spine chapter of an EPUB file.

    ``META-INF/container.xml`` is read to locate the package document; that
    document's manifest and spine give the chapter files in reading order.
    The text of each chapter's ``<body>`` is passed to
    ``wordList.makeFreqFromText`` together with the words collected from the
    earlier chapters, and the result is handed to
    ``wordList.createChapterFile``.

    Args:
        filename: Path to the EPUB; a relative path is made absolute first.

    Returns:
        A list of ``(csvPath, language, title, chapterLabel)`` tuples, one per
        processed chapter.  ``title`` falls back to the EPUB's base name and
        ``language`` to ``"BG"`` when the package metadata does not provide
        them.  ``chapterLabel`` is the section number followed by the NCX
        label of the chapter, or by the chapter's file name when the NCX has
        no entry for it.
    """
    # ZIP member names always use "/" regardless of platform, so they must
    # not be built with os.path.join (which yields "\\" on Windows).
    import posixpath

    if not os.path.isabs(filename):
        filename = os.path.abspath(filename)

    allWords = []
    fileList = []

    # The context manager closes the archive even if parsing fails midway.
    with zipfile.ZipFile(filename) as epub:
        metaDom = xml.dom.minidom.parseString(epub.read("META-INF/container.xml"))
        opsFile = metaDom.getElementsByTagName("rootfile")[0].getAttributeNode("full-path").value
        opsDom = xml.dom.minidom.parseString(epub.read(opsFile))
        opsDir = posixpath.dirname(opsFile)

        def metadataText(tag, default):
            # A missing <metadata>, a missing element or an empty element all
            # yield the default instead of raising.
            metadata = opsDom.getElementsByTagName("metadata")
            nodes = metadata[0].getElementsByTagName(tag) if metadata else []
            if nodes and nodes[0].childNodes:
                return getattr(nodes[0].childNodes[0], "data", default)
            return default

        title = metadataText("dc:title", os.path.basename(filename))
        language = metadataText("dc:language", "BG")

        manifestItems = opsDom.getElementsByTagName("manifest")[0].getElementsByTagName("item")

        # id -> href, built once instead of rescanning the manifest for every
        # spine entry.  A later duplicate id overrides an earlier one.
        hrefById = {item.getAttribute("id"): item.getAttribute("href") for item in manifestItems}

        # The NCX table of contents is looked up by the manifest id "ncx".
        ncxFile = next(
            (item.getAttribute("href") for item in manifestItems if item.getAttribute("id") == "ncx"),
            None,
        )

        # content src -> navLabel text, built once.  A later navPoint with the
        # same src overrides an earlier one.
        labelBySrc = {}
        if ncxFile:
            ncxDom = xml.dom.minidom.parseString(epub.read(posixpath.join(opsDir, ncxFile)))
            for navMap in ncxDom.getElementsByTagName("navMap")[:1]:
                for navPoint in navMap.getElementsByTagName("navPoint"):
                    contents = navPoint.getElementsByTagName("content")
                    labels = navPoint.getElementsByTagName("navLabel")
                    if not contents or not labels:
                        continue
                    texts = labels[0].getElementsByTagName("text")
                    if not texts or not texts[0].childNodes:
                        continue
                    labelBySrc[contents[0].getAttribute("src")] = texts[0].childNodes[0].data

        spineItems = opsDom.getElementsByTagName("spine")[0].getElementsByTagName("itemref")
        for section, chapter in enumerate(spineItems, 1):
            chapterFilename = hrefById.get(chapter.getAttribute("idref"))
            if not chapterFilename:
                # The spine refers to an id the manifest does not declare.
                # Skip it (its section number stays consumed) rather than
                # re-processing the previous chapter's file.
                continue

            # Without an NCX entry, label the chapter with its file name
            # instead of reusing the previous chapter's label.
            chapterName = labelBySrc.get(chapterFilename, chapterFilename)

            chapterText = epub.read(posixpath.join(opsDir, chapterFilename))
            soup = BeautifulSoup.BeautifulSoup(chapterText)
            body_text = "".join(soup.body(text=True))

            freqency = wordList.makeFreqFromText(body_text, allWords)
            # Accumulate the distinct words seen so far for the next chapter.
            allWords = list(set(chain(allWords, freqency.keys())))

            sectionPrefix = "{:02d} - ".format(section)
            csvPath = filename + ".cards/{:02d} - ".format(section) + chapterFilename + ".csv"
            wordList.createChapterFile(csvPath, freqency)
            fileList.append((csvPath, language, title, sectionPrefix + chapterName))

    return fileList