Exemplo n.º 1
0
 def test_no_redirect(self):
     """``assemble()`` must raise when the cached page is ``self.NON_INDEX``.

     The cache is primed with ``self.NON_INDEX`` for the blog URL, and the
     test expects ``ebook.assemble()`` to raise ``ListingNotFoundError``
     carrying the message ``'No listing filter found'``.
     """
     cache = TestCache()
     cache.put('https://www.example.com/blog', self.NON_INDEX)
     ebook = Ebook('https://www.example.com/blog', 10, cache)
     try:
         ebook.assemble()
     except ListingNotFoundError as expected:
         # ``except X as name`` is valid on Python 2.6+ and Python 3; the
         # old ``except X, name`` spelling is a SyntaxError on Python 3.
         # NOTE(review): ``.message`` is not a built-in exception attribute
         # on Python 3 - presumably ListingNotFoundError defines it; confirm.
         self.assertEqual('No listing filter found', expected.message)
     else:
         # Reached only when assemble() returned without raising.
         self.fail('Expected ebook.assemble to throw')
Exemplo n.º 2
0
class EbookTest(unittest.TestCase):
    """Checks the attribute types ``Ebook`` exposes for the sample text file."""

    def setUp(self):
        """Create a fresh ``Ebook`` for ``test_data/zen_en.txt`` before each test."""
        self.ebook = Ebook(os.path.join(THIS_DIR, "test_data", "zen_en.txt"))

    def test_1_read(self):
        """After ``read()``, ``content`` is a ``str``."""
        book = self.ebook
        book.read()
        self.assertIsInstance(book.content, str)

    def test_2_split_book_into_sentences(self):
        """After ``split_into_sentences()``, ``sentences`` is a ``list``."""
        book = self.ebook
        book.split_into_sentences()
        self.assertIsInstance(book.sentences, list)
Exemplo n.º 3
0
	def make_ebook(self):
		"""Build an ``Ebook`` titled ``self.title`` with one chapter per link.

		Every link returned by ``self.get_chapter_links()`` is turned into a
		chapter with ``self.makeChapter`` and appended, in order, to the
		ebook's ``chapters`` list.

		Returns:
			The populated ``Ebook``.
		"""
		book = Ebook(self.title)
		for chapter_link in self.get_chapter_links():
			book.chapters.append(self.makeChapter(chapter_link))
		return book
Exemplo n.º 4
0
def read_docs_folder(root):
    """Describe every supported document found under ``<root>/documents``.

    The tree is searched recursively, one file extension at a time, and each
    match is wrapped in an ``Ebook`` to read its ``path``, ``hash`` and
    ``asin`` attributes.

    Args:
        root: Folder that contains the ``documents`` directory.

    Returns:
        A list of dicts with the keys ``"path"``, ``"name"``,
        ``"canonic_filename"``, ``"hash"``, ``"asin"`` and ``"processed"``
        (always ``False``), grouped by extension in the order listed below.
    """
    suffixes = (
        'pdf', 'mobi', 'prc', 'txt', 'tpz', 'azw1', 'azw', 'manga', 'azw2',
        'azw3',
    )

    found = []
    for suffix in suffixes:
        pattern = root + "/documents/**/*.{}".format(suffix)
        found.extend(glob.glob(pattern, recursive=True))

    records = []
    for doc_path in found:
        book = Ebook(doc_path)
        records.append({
            "path": doc_path,
            "name": os.path.basename(doc_path),
            "canonic_filename": book.path,
            "hash": book.hash,
            "asin": book.asin,
            "processed": False,
        })

    return records
Exemplo n.º 5
0
class TestSequenceFunctions(unittest.TestCase):
    """Expectations for ``Ebook.escape_text_to_html``."""

    def setUp(self):
        """Give each test its own ``Ebook`` instance."""
        self.book = Ebook()

    def _assert_escaped(self, source, expected):
        """Assert that escaping *source* yields exactly *expected*."""
        self.assertEqual(self.book.escape_text_to_html(source), expected)

    def test_1_line(self):
        """A single plain line comes back unchanged."""
        plain = "one line of text"
        self._assert_escaped(plain, plain)

    def test_2_line(self):
        """Two plain lines separated by a newline come back unchanged."""
        two_lines = "one line of text\nsecond line"
        self._assert_escaped(two_lines, two_lines)

    def test_unicode(self):
        """The section sign is expected back as the same character."""
        self._assert_escaped(u"§", "§")

    def test_greater_than_and_less_than(self):
        """Bare ``<`` and ``>`` become ``&lt;`` and ``&gt;``."""
        self._assert_escaped("0 < 1 and 2 > 1", "0 &lt; 1 and 2 &gt; 1")

    def test_html_tags(self):
        """A ``<b>`` element is left as markup."""
        markup = "<b>bold</b> plain text"
        self._assert_escaped(markup, markup)

    def test_more_html_tags(self):
        """An ``<i>`` element and a numeric character reference are left alone."""
        italic = "<i>bold</i> plain text"
        self._assert_escaped(italic, italic)

        with_entity = "<b>bold</b> &#x300; plain text"
        self._assert_escaped(with_entity, with_entity)

    def test_escaped_chars_are_ignored(self):
        """Backslash-prefixed characters are escaped instead of kept as markup."""
        self._assert_escaped(r"\&amp;", "&amp;amp;")
        self._assert_escaped(r"\<b\> <b> \<b\> \<b>",
                             "&lt;b&gt; <b> &lt;b&gt; &lt;b&gt;")
def create_collections_by_filesystem(root):
    """Build a collections mapping from the folders under ``<root>/documents``.

    Each direct sub-directory of ``documents`` becomes one collection keyed
    ``"<directory name>@en-US"``. Its ``"items"`` list holds
    ``Ebook(path).fileident()`` for every entry in that directory and its
    ``"lastaccess"`` is ``0``.

    Args:
        root: Folder that contains the ``documents`` directory.

    Returns:
        dict mapping collection name to ``{"items": [...], "lastaccess": 0}``.

    Raises:
        SystemExit: With status 1, after printing a message, when
            ``documents`` contains directories nested more than one level.
    """
    import sys

    all_dirs = glob.glob(root + "/documents/**/*/", recursive=True)
    toplevel_dirs = glob.glob(root + "/documents/*/")
    # Compare sorted copies: the two glob calls are independent, so a mere
    # difference in listing order must not be mistaken for nested folders.
    if sorted(all_dirs) != sorted(toplevel_dirs):
        print("only single level of nesting supported currently")
        # sys.exit() rather than the site-provided exit() helper, and a
        # non-zero status so callers can tell that this is a failure.
        sys.exit(1)

    collections = {}

    for directory in toplevel_dirs:
        # normpath drops the trailing slash so basename yields the folder name
        directory_name = os.path.basename(os.path.normpath(directory))
        collection_name = directory_name + "@en-US"
        items = [
            Ebook(filepath).fileident()
            for filepath in glob.glob(directory + "/*")
        ]
        collections[collection_name] = {"items": items, "lastaccess": 0}

    return collections
Exemplo n.º 7
0
 def __init__(self,
              path,
              source_language,
              target_language,
              engine="Google"):
     """Set up a translation run for the book stored at *path*.

     Args:
         path: Passed unchanged to ``Ebook``.
         source_language: Stored on the instance and given to ``Translator``.
         target_language: Stored on the instance and given to ``Translator``.
         engine: Third argument for ``Translator``; defaults to ``"Google"``.
     """
     self.source_language = source_language
     self.target_language = target_language
     self.ebook = Ebook(path)
     # The set_* helpers run only after self.ebook exists; presumably they
     # derive their values from it - TODO confirm against their definitions.
     self.set_out_path()
     self.set_counter_path()
     self.set_start_point()
     self.translator = Translator(source_language, target_language, engine)
Exemplo n.º 8
0
    def run(self):
        parser = argparse.ArgumentParser()
        parser.add_argument("url", help="URL to download")
        parser.add_argument("--limit",
                            type=int,
                            default=0,
                            help="Max number of articles to download")
        parser.add_argument('--clean_cache', action='store_true')
        args = parser.parse_args()
        if not args.url:
            parser.print_help()
            exit(1)

        url = self._sanitize_url(args.url)

        if args.clean_cache:
            Cache(url).clean()
            return

        ebook = Ebook(url, args.limit, Cache(url))
        try:
            ebook.assemble()
            print("Wrote %s to %s" % (ebook.get_title(), ebook.get_filename()))
        except base.FilterNotFoundError, e:
            print("""
ERROR: Blook could not figure out how to parse {url}.

To add support for downloading this blog, please create an issue at
https://github.com/kchodorow/blook/issues with the following title:

    {msg} for {url}

Blook created a file called 'unparsable.html' in this directory, which contains
the HTML it didn't recognize. Please attach it to the GitHub issue.
""".format(url=url, msg=e.message))
Exemplo n.º 9
0
 def test_assemble(self):
     """Assembling from a cached ``self.INDEX`` page gives the expected names."""
     site = 'https://www.example.com'
     page_cache = TestCache()
     page_cache.put(site, self.INDEX)

     book = Ebook(site, 2, page_cache)
     book.assemble()

     self.assertEqual('Nhl', book.get_title())
     self.assertEqual('Nhl.epub', book.get_filename())
Exemplo n.º 10
0
class TestSequenceFunctions(unittest.TestCase):
    """Expectations for ``Ebook.escape_text_to_html``."""

    def setUp(self):
        """Give each test its own ``Ebook`` instance."""
        self.book = Ebook()

    def test_1_line(self):
        """A single plain line comes back unchanged."""
        source = "one line of text"
        escaped = self.book.escape_text_to_html(source)
        self.assertEqual(escaped, source)

    def test_2_line(self):
        """Two plain lines separated by a newline come back unchanged."""
        source = "one line of text\nsecond line"
        escaped = self.book.escape_text_to_html(source)
        self.assertEqual(escaped, source)

    def test_unicode(self):
        """The section sign becomes the numeric reference ``&#167;``."""
        escaped = self.book.escape_text_to_html(u"§")
        self.assertEqual(escaped, "&#167;")

    def test_greater_than_and_less_than(self):
        """Bare ``<`` and ``>`` become ``&lt;`` and ``&gt;``."""
        escaped = self.book.escape_text_to_html("0 < 1 and 2 > 1")
        self.assertEqual(escaped, "0 &lt; 1 and 2 &gt; 1")

    def test_html_tags(self):
        """A ``<b>`` element is left as markup."""
        source = "<b>bold</b> plain text"
        escaped = self.book.escape_text_to_html(source)
        self.assertEqual(escaped, source)

    def test_more_html_tags(self):
        """An ``<i>`` element and a numeric character reference are left alone."""
        italic = "<i>bold</i> plain text"
        self.assertEqual(self.book.escape_text_to_html(italic), italic)

        with_entity = "<b>bold</b> &#x300; plain text"
        self.assertEqual(self.book.escape_text_to_html(with_entity),
                         with_entity)

    def test_escaped_chars_are_ignored(self):
        """Backslash-prefixed characters are escaped instead of kept as markup."""
        escaped = self.book.escape_text_to_html(r"\&amp;")
        self.assertEqual(escaped, "&amp;amp;")

        escaped = self.book.escape_text_to_html(r"\<b\> <b> \<b\> \<b>")
        self.assertEqual(escaped, "&lt;b&gt; <b> &lt;b&gt; &lt;b&gt;")
Exemplo n.º 11
0
def rename_files(root):
    """Rename every dotted file under ``<root>/documents`` after its metadata.

    Each file is wrapped in an ``Ebook``. The new base name is
    ``[<author>]-<title>`` when the ebook has an author and ``<title>``
    otherwise, with a fixed set of characters replaced by ``_``. The file
    stays in its directory and keeps its extension.

    Args:
        root: Folder that contains the ``documents`` directory.
    """
    # Characters swapped for "_" to make the name appropriate for a FAT
    # filesystem; one translate() pass replaces the whole set.
    to_underscore = str.maketrans(dict.fromkeys(' /\\*?"\':|!', "_"))

    for old_path in glob.glob(root + "/documents/**/*.*", recursive=True):
        folder = os.path.dirname(old_path)
        suffix = os.path.splitext(old_path)[1]

        book = Ebook(old_path)
        if book.author:
            stem = u"[{}]-{}".format(book.author, book.title)
        else:
            stem = book.title

        safe_stem = stem.translate(to_underscore)
        os.rename(old_path, os.path.join(folder, safe_stem + suffix))
Exemplo n.º 12
0
def download():
    """Fetch every chapter in ``links``, including sub-pages, and save the ebook.

    For each link the chapter page is requested relative to ``BASE_URL`` and
    parsed as gb18030. If the page has a ``<p class="pageLink">`` block, its
    anchors are collected and the block is removed before the content is
    extracted; the content of each distinct sub-page (in sorted href order)
    is then appended to the chapter content.
    """
    book = Ebook(title)

    for link in links:
        print(link)
        heading = link.get_text()
        response = requests.get(urllib.parse.urljoin(BASE_URL, link['href']))
        soup = BeautifulSoup(response.content,
                             'html.parser',
                             from_encoding="gb18030")

        # the chapter starts out with just its heading
        chapter = Chapter(heading)

        # pull the pager out of the tree so it is not part of the content
        pager = soup.find('p', class_='pageLink')
        extra_links = None
        if pager:
            extra_links = pager.find_all('a', href=True)
            pager.extract()

        body = get_content(soup)

        # follow the sub-pages, each distinct href once, in sorted order
        if extra_links:
            targets = sorted({anchor['href'] for anchor in extra_links})
            print(targets)

            for target in targets:
                extra_page = requests.get(
                    urllib.parse.urljoin(BASE_URL, target))
                extra_soup = BeautifulSoup(extra_page.content,
                                           'html.parser',
                                           from_encoding="gb18030")
                body.append(get_content(extra_soup))

        chapter.set_content(str(body))
        book.add_chapter(chapter)

    book.save()
Exemplo n.º 13
0
def download():
    """Fetch every chapter in ``links`` and save them as one ebook.

    Each link's page is requested relative to ``BASE_URL``, parsed as
    gb18030, and its extracted content is stored in a ``Chapter`` named
    after the link text.
    """
    book = Ebook(title)

    for link in links:
        print(link)

        heading = link.get_text()
        address = urllib.parse.urljoin(BASE_URL, link['href'])
        markup = requests.get(address).content
        soup = BeautifulSoup(markup, 'html.parser', from_encoding="gb18030")

        # one chapter per link, titled with the link text
        chapter = Chapter(heading)
        chapter.set_content(get_content(soup))
        book.add_chapter(chapter)

    book.save()
Exemplo n.º 14
0
from ebook import Ebook

# Demo: open the book, turn four pages, show its status, then close it.
k = Ebook('Lalka', 'Bolesław Prus', 668)
k.open()
for _ in range(4):
    k.next_page()
k.show_status()
k.close()
Exemplo n.º 15
0
from ebook import Ebook

# The prompt asks (in Chinese) for the table-of-contents link to download,
# showing https://www.zwdu.com/ as the example site.
url = input('请输入需要下载的目录链接(https://www.zwdu.com/): ')

# Hand the entered link to Ebook and start it.
book = Ebook(url)
book.run()
Exemplo n.º 16
0
 def setUp(self):
     """Create the ``Ebook`` under test from ``test_data/zen_en.txt``."""
     self.ebook = Ebook(os.path.join(THIS_DIR, "test_data", "zen_en.txt"))
Exemplo n.º 17
0
 def setUp(self):
     """Create a fresh, argument-less ``Ebook`` as ``self.book`` for each test."""
     self.book = Ebook()
Exemplo n.º 18
0
                    metavar='N',
                    type=str,
                    nargs='+',
                    help='path to infile txt')
# NOTE(review): the help text below reads like a file path, yet the value is
# passed to Translator(...) further down - presumably a language code; confirm.
parser.add_argument('--lang',
                    metavar='N',
                    type=str,
                    nargs='+',
                    help='path outfile html')

args = parser.parse_args()

# Both options are needed; parser.description is the message shown when one
# is missing.
# NOTE(review): assert statements are skipped under ``python -O``.
assert args.input and args.lang, parser.description

# Only the first value given for each option is used.
path = args.input[0]
text = Ebook(path).sentences
translator = Translator(args.lang[0])
# Pair every sentence with its index in ``text``.
enumerated_sentences = [{
    "position": position,
    "sentence": sentence
} for position, sentence in enumerate(text)]


def translate(sen_dict):
    global total
    translated = translator.translate(sen_dict["sentence"])
    print(round(sen_dict["position"] / total * 100, 2), r"%")
    return {
        "position": sen_dict["position"],
        "original": sen_dict["sentence"],
        "translated": translated
Exemplo n.º 19
0
 def setUp(self):
     """Create a fresh, argument-less ``Ebook`` as ``self.book`` for each test."""
     self.book = Ebook()