Example #1
    def fetch_metadata(self):
        content = http.get(self.base_url)
        doc = BeautifulSoup(content, "html.parser")

        fic_id = self.extract_url('id')

        profile_top = doc.select_one('div#profile_top')

        title = profile_top.select_one('b.xcontrast_txt').string.strip()
        author = profile_top.select_one('a.xcontrast_txt').string.strip()
        summary = profile_top.select_one('div.xcontrast_txt').string.strip()

        # The gray metadata line is a '-'-separated list; the code assumes the
        # order rating, language, categories, characters, chapter count,
        # word count, followed by any remaining fields (dates, completion
        # status, ...).
        meta_txt = profile_top.select_one('span.xgray').get_text()
        rating, language, categories, characters, chapters, wordcount, *rest = [
            item.strip() for item in meta_txt.split('-')
        ]

        return Fanfic(thread_id=fic_id,
                      thread_type='fanfiction.net',
                      title=title,
                      author=author,
                      words=parse_wordcount(wordcount),
                      tags=categories,
                      language=language,
                      # tolerate both "Complete" and "Status: Complete" labels
                      status='Complete' if any('Complete' in item for item in rest) else None,
                      summary=summary,
                      thread_url=self.base_url)
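
Both http.get and parse_wordcount are helpers from the surrounding project whose implementations are not shown in these excerpts (the excerpts are methods of a scraper class, with module-level imports such as BeautifulSoup omitted). A minimal sketch of what the two helpers might look like, assuming http.get is a thin wrapper over the requests library and parse_wordcount turns a field like "Words: 80,000" into an integer:

    import requests

    def get(url, params=None):
        # Hypothetical stand-in for http.get: fetch a page and return its HTML.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.text

    def parse_wordcount(text):
        # Hypothetical stand-in for parse_wordcount:
        # "Words: 80,000" -> 80000 (0 if the field is missing or malformed).
        digits = text.split(':')[-1].replace(',', '').strip()
        return int(digits) if digits.isdigit() else 0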
Example #2
    def fetch_threadmarks(self, fanfic):
        content = http.get(self.base_url)
        doc = BeautifulSoup(content, 'html5lib')

        # Each chapter appears as an <option value="N"> in the story's chapter
        # dropdown; per-chapter word counts aren't available there, so words is 0.
        chapters = doc.select_one('#chap_select').select('option')
        return [
            Threadmark(post_id=int(el['value']),
                       fanfic_id=fanfic.id,
                       title=el.string.strip(),
                       words=0) for el in chapters
        ]
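
The selector logic above can be sanity-checked against a minimal fragment shaped like fanfiction.net's chapter dropdown (the fragment is illustrative, not copied from the site):

    from bs4 import BeautifulSoup

    html = """
    <select id="chap_select">
      <option value="1">1. Prologue</option>
      <option value="2">2. First Steps</option>
    </select>
    """
    doc = BeautifulSoup(html, 'html5lib')
    options = doc.select_one('#chap_select').select('option')
    print([(int(el['value']), el.string.strip()) for el in options])
    # [(1, '1. Prologue'), (2, '2. First Steps')]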
Example #3
    def fetch_chapters(self, fanfic):
        threadmark_count = len(fanfic.threadmarks)
        for chapter_num in range(1, threadmark_count + 1):
            # Chapter pages live at <base_url><chapter_num>/.
            chapter_url = "%s%d/" % (self.base_url, chapter_num)
            print("Fetch Chapter ...", chapter_num, chapter_url)
            content = http.get(chapter_url)
            doc = BeautifulSoup(content, "html5lib")
            story_text = doc.select_one('div.storytext')
            thread.store_post(fanfic.thread_key, chapter_num,
                              story_text.prettify())

            time.sleep(1)
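
thread.store_post is the project's persistence helper and its implementation is not shown here. A minimal file-based stand-in, purely to illustrate the assumed contract (thread key and post/chapter number in, HTML saved somewhere):

    import os

    def store_post(thread_key, post_id, html):
        # Hypothetical stand-in for thread.store_post: write each post's HTML
        # to disk under a per-thread directory.
        directory = os.path.join('posts', str(thread_key))
        os.makedirs(directory, exist_ok=True)
        path = os.path.join(directory, '%s.html' % post_id)
        with open(path, 'w', encoding='utf-8') as fh:
            fh.write(html)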
Example #4
    def fetch_metadata(self):
        content = http.get(self.base_url)
        doc = BeautifulSoup(content, "html.parser")

        domain, thread_id = self.extract_url('domain', 'id')

        # On a XenForo thread page the <h1> holds the thread title and the page
        # description links to the author's profile.
        title = doc.find('h1').text.strip()
        author = doc.select_one('p#pageDescription a.username').string.strip()

        return Fanfic(thread_id=thread_id,
                      thread_type="xenforo.%s" % domain,
                      title=title,
                      author=author,
                      thread_url=self.base_url)
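
extract_url is another project helper not shown above; for XenForo threads it yields the board's domain and the numeric thread id. A hypothetical free-function version, assuming thread URLs of the form https://forums.example.com/threads/some-story.12345/ (fanfiction.net URLs would need their own pattern):

    import re
    from urllib.parse import urlparse

    def extract_url(base_url, *parts):
        # Hypothetical sketch of self.extract_url for XenForo-style thread URLs.
        parsed = urlparse(base_url)
        match = re.search(r'\.(\d+)/?$', parsed.path)
        values = {'domain': parsed.netloc, 'id': match.group(1) if match else None}
        found = [values[part] for part in parts]
        return found if len(found) > 1 else found[0]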
Example #5
    def fetch_chapters(self, fanfic):
        threadmark_count = len(fanfic.threadmarks)
        reader_url = "%s/reader" % self.base_url
        # The reader view shows 10 threadmarked posts per page.
        page_count = int(math.ceil(threadmark_count / 10))

        for page in range(1, page_count + 1):
            print("Fetch Page ...", page)
            content = http.get(reader_url, params={'page': page})
            doc = BeautifulSoup(content, "html.parser")
            post_list = doc.select('li.message.hasThreadmark')
            for post_el in post_list:
                post_id = post_el['id'].split('-')[1]  # <li id="post-12345"> -> "12345"
                post_content = post_el.find('article')
                post_content.blockquote.unwrap()  # drop the wrapper, keep its contents

                thread.store_post(fanfic.thread_key, post_id,
                                  post_content.prettify())

            time.sleep(1)
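
unwrap() removes a tag from the tree while keeping its children in place, which is how the <blockquote> wrapper is stripped above without losing the post body. A tiny, made-up illustration:

    from bs4 import BeautifulSoup

    doc = BeautifulSoup('<article><blockquote><p>Chapter text.</p></blockquote></article>',
                        'html.parser')
    doc.article.blockquote.unwrap()  # drop the <blockquote>, keep its contents
    print(doc.article)               # <article><p>Chapter text.</p></article>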
Example #6
    def fetch_threadmarks(self, fanfic):
        threadmarks_url = "%sthreadmarks" % self.base_url

        content = http.get(threadmarks_url)
        doc = BeautifulSoup(content, "html.parser")

        threadmark_list = doc.select(
            'div.threadmarkList ol li.threadmarkListItem')
        threadmarks = [
            # data-previewurl looks like "posts/<post id>/preview"; the middle
            # segment is the numeric post id.
            Threadmark(post_id=int(el.a['data-previewurl'].split('/')[1]),
                       fanfic_id=fanfic.id,
                       title=el.a.string.strip(),
                       words=int(el['data-words']),
                       likes=int(el['data-likes']),
                       author=el['data-content-author'],
                       published=datetime.fromtimestamp(
                           int(el['data-content-date'])))
            for el in threadmark_list
        ]

        return threadmarks
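
Taken together, the methods above would presumably be driven by something like the following; the scrape wrapper and the assumption that threadmarks can be assigned back onto the Fanfic object are illustrative only:

    def scrape(scraper):
        # Hypothetical driver tying the three methods together; `scraper` is an
        # instance of the (not shown) scraper class these methods belong to.
        fanfic = scraper.fetch_metadata()                       # thread id, title, author, ...
        fanfic.threadmarks = scraper.fetch_threadmarks(fanfic)  # one Threadmark per chapter
        scraper.fetch_chapters(fanfic)                          # download and store each chapter
        return fanfic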