Exemplo n.º 1
0
 def process_novel(self, novel: Novel, url: str):
     """Download every chapter of ``novel`` into a text file and record it.

     The novel's index page at ``url`` is fetched and scraped for the word
     count, the category, the link to the first chapter and the number of
     chapters. Chapters are then fetched one after another by following the
     "下一章" link and written to ``novels/<id>_<title>.txt``. Once every
     chapter has been written the novel is stored through
     ``self._db.insert_novel`` and a "Done" line is written to
     ``self.log_file``.

     Any exception raised during an attempt (failed request, a pattern that
     does not match, file error, ...) is printed and the whole download is
     started again, for at most ``self.retry_times`` attempts in total. If
     every attempt fails, a failure line is printed and logged instead.

     Args:
         novel: Novel to fill in (``count`` and ``type`` are assigned here)
             and to persist; its ``id`` and ``title`` name the output file.
         url: Address of the novel's index page.
     """
     # A bounded loop guarantees termination: every failed attempt, whatever
     # its cause, consumes exactly one retry.
     for _ in range(self.retry_times):
         try:
             result = self._opener.open(url).read().decode()
             # print(novel)
             novel.count = int(
                 re.findall(r'<div>&nbsp;总字数:([0-9]*?)</div>',
                            result).pop())
             novel.type = re.findall(
                 r'<div>类别:<a href="/wapsort/[0-9]*?_[0-9]*?\.html" title=".*?">(.*?)</a></div>',
                 result).pop()
             page = BASE_URL + re.findall(
                 r'<a href="(/novel/[0-9]*?/[0-9]*?\.html)" title="' + PAGE +
                 '">1</a>', result).pop()
             max_page = int(
                 re.findall(
                     r'<span style="color:#666;font-size:11px;line-height: 22px;">共([0-9]*?)章节</span>',
                     result).pop())
             # print(page)
             result = self._opener.open(page).read().decode()
             # Mode 'w' truncates, so a retried attempt starts from a clean file.
             with open('novels/%d_%s.txt' % (novel.id, novel.title),
                       'w') as f:
                 for i in range(1, max_page + 1):
                     content = re.findall(
                         r'<div id="nr1" style="font-size:18px;">([\s\S]*?)</div>',
                         result).pop()
                     # Strip the paragraph markup; this is plain text
                     # replacement, not a regex, so '\r\n' stays a real CRLF.
                     content = content.replace('</p>\r\n<p></p>', '')
                     content = content.replace('<p>', '')
                     content = content.replace('</p>', '')
                     content = content.replace('&nbsp;', ' ')
                     # print(content)
                     print('%s->%d/%d' % (novel.title, i, max_page))
                     f.write(content)
                     if i == max_page:
                         # The last chapter has no "next" link to follow.
                         break
                     page = re.findall(
                         r'<td class="next"><a id="pt_next" href="(' +
                         BASE_URL +
                         r'/novel/[0-9]*?/[0-9]*?\.html)">下一章</a></td>',
                         result).pop()
                     # print(page)
                     result = self._opener.open(page).read().decode()
                     # Pause between chapter requests.
                     time.sleep(1)
             # The file is closed (and flushed) before the novel is recorded.
             self._db.insert_novel(novel)
             print(novel.title + ' Done')
             self.log_file.write(novel.title + " Done\n")
             return
         except Exception as e:
             print(e)
     print("!!!Fail to save %s" % novel.title)
     self.log_file.write("!!!Fail to save %s\n" % novel.title)
Exemplo n.º 2
0
 def start(self):
     """Walk the category listing pages and queue every new novel.

     The first listing page is fetched to read the highest page number from
     the ``go_page`` input's ``max`` attribute. Each listing page is then
     scraped for novel ids, titles, urls, authors and descriptions. Novels
     that ``self._db.check_novel_exists`` already knows are skipped; the
     others are handed to ``self._pool`` to be downloaded by
     ``self.process_novel``.
     """
     result = self._opener.open(BASE_URL +
                                '/wapsort/11_1.html').read().decode()
     max_page = int(
         re.findall(
             r'<input style="width: 50%;" type="number" name="page" value="" id="go_page" min="1" max="([0-9]*?)" />',
             result).pop())
     # max_page = 5
     # ``max_page`` is itself a valid page number, so the range is inclusive.
     for i in range(1, max_page + 1):
         result = self._opener.open(BASE_URL + '/wapsort/11_%d.html' %
                                    i).read().decode()
         ids = re.findall(
             '<h3><a href="' + BASE_URL +
             r'/novel/([0-9]*)\.html">.*?</a></h3>', result)
         titles = re.findall(
             '<h3><a href="' + BASE_URL +
             r'/novel/[0-9]*\.html">(.*?)</a></h3>', result)
         urls = re.findall(
             '<h3><a href="(' + BASE_URL + r'/novel/[0-9]*\.html)">', result)
         authors = re.findall('<p>作者:<strong>(.*?)</strong></p>', result)
         descriptions = re.findall(
             '<span class="abstract"><a href="' + BASE_URL +
             r'/novel/[0-9]*\.html">([\s\S]*?)</a></span>', result)
         # The five lists are parallel; zip stops at the shortest one instead
         # of raising IndexError when a page yields uneven matches.
         for novel_id, title, url, author, description in zip(
                 ids, titles, urls, authors, descriptions):
             novel = Novel(title, author, description, id=int(novel_id))
             if self._db.check_novel_exists(novel.id):
                 print("Skip existing: %s" % novel.title)
                 continue
             # ``self.process_novel`` is already bound, so ``self`` must not
             # be passed a second time.
             self._pool.submit(self.process_novel, novel, url)
def html_parse(html, fileHandle):
    """Parse the reading-list HTML into novels or save it to a text file.

    Rows are the ``<tr class="rl_links">`` elements (their ``data-title``
    attribute is the title). The ``<a class="chp-release">`` links are
    consumed two per row: the first of each pair is treated as the current
    chapter, the second as the latest chapter.

    fileHandle is a work in progress feature of creating a reading list
    offline.

    Args:
        html: HTML document to parse.
        fileHandle: When truthy, the parsed entries are returned as a list of
            ``Novel`` objects. When falsy, they are written to
            'readingList.txt' as ``title,current,latest`` lines and nothing
            is returned.

    Returns:
        The list of ``Novel`` objects when ``fileHandle`` is truthy,
        otherwise ``None``.

    Raises:
        SystemExit: If the number of chapter links is odd.
    """
    soup = BeautifulSoup(html, 'html.parser')
    print("\t Parsing Reading List")
    print(" ====================================")
    title_names = soup.find_all("tr", attrs={"class": "rl_links"})
    chapters = soup.find_all("a", attrs={"class": "chp-release"})
    if (len(chapters) % 2) != 0:
        print("Error: Incorrect number of chapters")
        sys.exit()
    if fileHandle:
        reading_list = []
        for i, row in enumerate(title_names):
            title = row.attrs["data-title"]
            # Row i owns the chapter links at positions 2*i and 2*i + 1.
            current_chapter = chapters[2 * i].get_text()
            latest_chapter = chapters[2 * i + 1].get_text()
            # NOTE(review): the latest chapter is passed before the current
            # one here, whereas fileReader passes the saved columns in
            # current, latest order - confirm against Novel's signature.
            reading_list.append(Novel(title, latest_chapter, current_chapter))
        return reading_list
    # The context manager closes (and flushes) the save file even if a row
    # is malformed and raises part-way through.
    with open("readingList.txt", "w") as saveFile:
        for i, row in enumerate(title_names):
            saveFile.write(row.attrs["data-title"] + ",")
            saveFile.write(chapters[2 * i].get_text() + ",")
            saveFile.write(chapters[2 * i + 1].get_text() + "\n")
    print("The save file,'readingList.txt' has been created...")
Exemplo n.º 4
0
def search_novel():
    """Render the search results page for the submitted query.

    Reads the ``content`` field of the request form, passes it to
    ``search`` and turns the result into ``Novel`` objects. The result is
    indexed as six parallel sequences: element ``i`` of each of the first
    six sequences supplies the six positional arguments of one ``Novel``.

    Returns:
        The rendered 'search_result.html' template, given the query text
        and the list of novels.
    """
    text = request.form.get('content')
    columns = search(text)
    # The first column decides how many novels are built.
    n_list = [
        Novel(columns[0][row], columns[1][row], columns[2][row],
              columns[3][row], columns[4][row], columns[5][row])
        for row in range(len(columns[0]))
    ]
    return render_template('search_result.html', text=text, novel=n_list)
Exemplo n.º 5
0
def grab_volume(url, output_dir, cover_path, out_format):
    """
    Grab one volume and generate its output file.

    The novel behind ``url`` is loaded, its information is collected and
    handed, together with the output settings, to ``Epub``, which generates
    the file.

    Args:
        url: A string representing the url which was input by the user
        output_dir: A string representing the path of the output EPUB file
        cover_path: A string representing the path of the EPUB cover
        out_format: A string representing the output format

    Raises:
        Exception: Any error raised while grabbing is printed together with
            the url and then re-raised.
    """
    try:
        print('Getting:' + url)
        source = Novel(url=url, single_thread=_SINGLE_THREAD)
        source.get_novel_information()
        details = source.novel_information()
        book = Epub(output_dir=output_dir,
                    cover_path=cover_path,
                    out_format=out_format,
                    **details)
        book.generate_file()
    except Exception as e:
        print('错误', str(e) + '\nAt:' + url)
        raise e
Exemplo n.º 6
0
def scrape():
    """Scrape the book at the requested url and send the result as a file.

    Expects a JSON request body with a ``url`` key. A directory named after
    the current ``time()`` value is created under ``DATA``, the novel is
    collected into it and the collected file is sent back.

    Returns:
        The file response on success, ``("Missing url in request", 400)``
        when the body has no ``url`` key, or
        ``("Failed to scrape book", 400)`` when scraping fails outside
        DEBUG mode. In DEBUG mode scraping errors are not caught.
    """
    try:
        url = request.json["url"]
    except KeyError:
        return "Missing url in request", 400

    def collect_and_send():
        # One fresh directory per request, named after the current time.
        workdir = f"{DATA}/{time()}"
        os.mkdir(workdir)
        fname = Novel(url).collect(workdir)
        return send_file(os.path.join(workdir, fname))

    if DEBUG:
        # Let errors propagate so they surface while debugging.
        return collect_and_send()

    try:
        return collect_and_send()
    except Exception as e:
        print(e)
        return "Failed to scrape book", 400
Exemplo n.º 7
0
def grab_volume(url, output_dir, cover_path):
    """
    Grab one volume and generate its EPUB.

    The novel behind ``url`` is loaded, its information is collected and
    handed, together with the output settings, to ``Epub``, which generates
    the file.

    Args:
        url: A string representing the url which was input by the user
        output_dir: A string representing the path of the output EPUB file
        cover_path: A string representing the path of the EPUB cover

    Raises:
        Exception: Any error raised while grabbing is re-raised after the
            url has been printed and, when ``HAS_QT`` is set, the warning
            and button signals have been emitted.
    """
    try:
        print_info('Getting:' + url)
        source = Novel(url=url, single_thread=SINGLE_THREAD)
        source.get_novel_information()
        details = source.novel_information()
        book = Epub(output_dir=output_dir, cover_path=cover_path, **details)
        book.generate_epub()
    except Exception as e:
        if HAS_QT:
            warning_text = str(e) + '\nat:' + url
            SENDER.sigWarningMessage.emit('错误', warning_text)
            SENDER.sigButton.emit()
        print(url)
        raise e
def fileReader():
    """Load the saved reading list from 'readingList.txt'.

    Each non-blank line is split on commas and its first three fields are
    passed, in order, to ``Novel``. Only the line terminator is removed from
    the third field, so a final line without a trailing newline keeps its
    last character.

    Returns:
        A list with one ``Novel`` per non-blank line of the save file.

    Raises:
        SystemExit: If 'readingList.txt' does not exist.
        IndexError: If a non-blank line has fewer than three fields.
    """
    if not os.path.isfile("readingList.txt"):
        print("Error: Save file, 'readingList.txt' is missing")
        sys.exit()
    readingList = []
    # The context manager guarantees the handle is closed, even on error.
    with open("readingList.txt", "r") as saveFile:
        for line in saveFile:
            line = line.rstrip("\n")
            if not line:
                # Tolerate stray blank lines instead of failing on them.
                continue
            data = line.split(",")
            readingList.append(Novel(data[0], data[1], data[2]))
    return readingList
Exemplo n.º 9
0
def grab_volume(url, output_dir, cover_path):
    """
    Grab one volume and generate its EPUB.

    Loads the novel behind ``url``, collects its information and builds an
    ``Epub`` from that information plus the output settings.

    Args:
        url: A string representing the url which was input by the user
        output_dir: A string representing the path of the output EPUB file
        cover_path: A string representing the path of the EPUB cover

    Raises:
        Exception: Any error raised while grabbing is re-raised after the
            url has been printed and, when ``HAS_QT`` is set, the warning
            and button signals have been emitted.
    """
    try:
        print_info('Getting:' + url)
        grabbed = Novel(url=url, single_thread=SINGLE_THREAD)
        grabbed.get_novel_information()
        information = grabbed.novel_information()
        Epub(output_dir=output_dir, cover_path=cover_path,
             **information).generate_epub()
    except Exception as e:
        if HAS_QT:
            # Report the failure to the GUI before re-raising.
            detail = str(e) + '\nat:' + url
            SENDER.sigWarningMessage.emit('错误', detail)
            SENDER.sigButton.emit()
        print(url)
        raise e
Exemplo n.º 10
0
 def __init__(self, bookid):
     """Set up the base ``Novel`` state, then the 69shu.io settings.

     Args:
         bookid: Passed unchanged to ``Novel.__init__``.
     """
     Novel.__init__(self, bookid)
     # Root address of the site.
     self.urlheader = 'https://www.69shu.io'
     # Presumably text to drop from scraped chapters (whole lines vs.
     # substrings) - confirm against how the base class uses these lists.
     self.blackliststr = []
     self.blackliststrline = ['一秒记住【69书吧www.69shu.io】,更新快,无弹窗,免费读!']
Exemplo n.º 11
0
 def __init__(self, bookid):
     """Set up the base ``Novel`` state, then the biqudu.net settings.

     Args:
         bookid: Passed unchanged to ``Novel.__init__``.
     """
     Novel.__init__(self, bookid)
     # Root address of the site.
     self.urlheader = 'https://www.biqudu.net'
     # Presumably text to drop from scraped chapters (whole lines vs.
     # substrings) - confirm against how the base class uses these lists.
     self.blackliststr = []
     self.blackliststrline = ['readx();', 'chaptererror();']
Exemplo n.º 12
0
from novel import Novel
from printer import Printer
from librarian import Librarian

# Demo script: one Novel is shared by a Librarian and a Printer, then the
# novel's year and content are updated.
# NOTE(review): presumably the librarian and printer react to those updates
# (observer-style) - confirm in the novel/librarian/printer modules.
novel = Novel("1984", "George Orwell")

librarian = Librarian()
librarian.setNovel(novel)

printer = Printer()
printer.setNovel(novel)

# Update the year
novel.setYear(1949)

# Update the content
novel.setContent(
    "Freedom is the freedom to say that two plus two make four. If that is granted, all else follows."
)
Exemplo n.º 13
0
from novel import Novel

# Analyze "The Three-Body Problem" (三体).
# The two file names read "Three-Body.txt" and "Three-Body main characters.txt".
novel = Novel("三体.txt", "三体主要人物.txt")
# The string reads "occurrence counts of the main character names in
# Three-Body"; presumably it is used as the chart title - confirm in Novel.
novel.draw_picture("三体主要人物名字出现次数")
Exemplo n.º 14
0
    def analyze_text(self, book_folder, out_folder):
        """Run the full name-analysis pipeline on ``self.input_file``.

        Results go to ``out_folder + <stem> + "/"``, where ``<stem>`` is the
        input file name up to its first dot; the folder is created when it
        does not exist. The pipeline runs the ``Novel`` steps in a fixed
        order and stores intermediate results after most of them. The
        processed novel is kept on ``self.novel``.

        Args:
            book_folder: Prefix prepended to ``self.input_file`` to locate
                the book.
            out_folder: Prefix under which the per-book result folder lives.

        Returns:
            ``novel.cluster_repetitions_df`` of the processed novel.
        """
        stem = self.input_file.split('.')[0]
        target_dir = out_folder + stem + "/"
        if not os.path.exists(target_dir):
            os.makedirs(os.path.dirname(target_dir))

        def in_target(name):
            # Path of a result file inside the per-book folder.
            return target_dir + name

        book = Novel(book_folder + self.input_file)
        book.read()
        book.parse_persons()
        book.find_persons_title()
        book.store(filename=in_target(self.all_names), data=book.persons)

        # if you do not remove single occurrences, eps behaviour will be unstable
        occurrence_limit = 2
        book.remove_less_than(occurrences=occurrence_limit)
        frequent_names_file = (stem + "_names_more_than_" +
                               str(occurrence_limit) + ".csv")
        book.store(filename=in_target(frequent_names_file), data=book.persons)

        book.cluster_aliases()
        book.associate_simple_single_names()
        book.associate_single_names()
        book.store(filename=in_target(self.clusters),
                   data=book.cluster_repetitions)

        book.create_cluster_repetitions_df()
        book.cluster_repetitions_df.to_pickle(in_target(stem + '.pkl'))

        book.dealiases()
        book.store(filename=in_target(stem + "_dealiased.txt"),
                   data=book.dealiased_text,
                   type='txt')

        # Run the coreference step after de-aliasing: coreference sometimes
        # writes a name right after a separator, which would otherwise glue
        # names together (e.g. "Potter,Hermione").
        book.coreference()
        book.store(filename=in_target(self.output_file),
                   data=book.dealiased_text,
                   type='txt')

        self.novel = book
        return book.cluster_repetitions_df
Exemplo n.º 15
0
    def analyze_text(self, book_folder, out_folder):
        """Run the name-analysis pipeline on ``self.input_file``.

        Results go to ``out_folder + <stem> + "/"``, where ``<stem>`` is the
        input file name up to its first dot; the folder is created when it
        does not exist. The ``Novel`` steps run in a fixed order and the
        persons, clusters and de-aliased text are stored along the way. The
        processed novel is kept on ``self.novel``; nothing is returned.

        Args:
            book_folder: Prefix prepended to ``self.input_file`` to locate
                the book.
            out_folder: Prefix under which the per-book result folder lives.
        """
        stem = self.input_file.split('.')[0]
        target_dir = out_folder + stem + "/"
        if not os.path.exists(target_dir):
            os.makedirs(os.path.dirname(target_dir))

        book = Novel(book_folder + self.input_file)
        book.read()
        book.parse_persons()
        all_names_path = target_dir + self.all_names
        book.store(filename=all_names_path, data=book.persons)

        # if you do not remove single occurrences, eps behaviour will be unstable
        occurrence_limit = 2
        book.remove_less_than(occurrences=occurrence_limit)
        frequent_names_path = (target_dir + stem + "_names_more_than_" +
                               str(occurrence_limit) + ".csv")
        book.store(filename=frequent_names_path, data=book.persons)

        book.cluster_aliases()
        book.associate_single_names()
        clusters_path = target_dir + self.clusters
        book.store(filename=clusters_path, data=book.cluster_repetitions)

        book.dealiases()
        output_path = target_dir + self.output_file
        book.store(filename=output_path, data=book.dealiased_text, type='txt')

        self.novel = book
Exemplo n.º 16
0
from novel import Novel

# Script entry point: look up the novel named 'Archfiend', choose which of
# its books to handle, then process it.
if __name__ == '__main__':
    Novel.load_novels()
    c = Novel.get_novel('Archfiend')
    # Stop here if the lookup did not return a Novel instance.
    assert isinstance(c, Novel)
    # Only book "1" is currently chosen; the other selections are disabled.
    # c.add_chosen_book("13")
    c.add_chosen_book("1")
    # c.add_chosen_book("15")
    # c.add_chosen_book("16")
    # c.add_chosen_book("17")
    # c.add_chosen_book("18")
    # c.add_chosen_book("19")
    c.process()