def fetch_metadata(self):
    """Scrape the fanfiction.net story page and build a Fanfic record.

    Pulls title/author/summary from the profile header and parses the
    grey '-'-separated metadata line for rating, language, tags, etc.
    """
    page = http.get(self.base_url)
    soup = BeautifulSoup(page, "html.parser")
    fic_id = self.extract_url('id')

    profile = soup.select_one('div#profile_top')
    title = profile.select_one('b.xcontrast_txt').string.strip()
    author = profile.select_one('a.xcontrast_txt').string.strip()
    summary = profile.select_one('div.xcontrast_txt').string.strip()

    # The metadata line looks like:
    # "Rated: T - English - Romance - Chars - Chapters: 3 - Words: 1,234 - ... - Complete"
    meta_text = profile.select_one('span.xgray').get_text()
    meta_parts = [part.strip() for part in meta_text.split('-')]
    rating, language, categories, characters, chapters, wordcount = meta_parts[:6]
    trailing = meta_parts[6:]  # remaining items; may contain "Complete"

    return Fanfic(thread_id=fic_id,
                  thread_type='fanfiction.net',
                  title=title,
                  author=author,
                  words=parse_wordcount(wordcount),
                  tags=categories,
                  language=language,
                  status='Complete' if 'Complete' in trailing else None,
                  summary=summary,
                  thread_url=self.base_url)
def fetch_threadmarks(self, fanfic):
    """Build one Threadmark per chapter from the chapter dropdown.

    Fix: single-chapter stories have no '#chap_select' dropdown at all,
    so select_one returns None and the original code crashed with
    AttributeError. Fall back to a single threadmark for chapter 1,
    titled after the fic itself.

    Args:
        fanfic: Fanfic record providing .id (and .title for the fallback).
    Returns:
        list[Threadmark], one per chapter; words are unknown here (0).
    """
    content = http.get(self.base_url)
    doc = BeautifulSoup(content, 'html5lib')
    chap_select = doc.select_one('#chap_select')
    if chap_select is None:
        # One-shot: fanfiction.net omits the chapter dropdown entirely.
        return [Threadmark(post_id=1, fanfic_id=fanfic.id,
                           title=fanfic.title, words=0)]
    return [Threadmark(post_id=int(el['value']),
                       fanfic_id=fanfic.id,
                       title=el.string.strip(),
                       words=0)
            for el in chap_select.select('option')]
def fetch_chapters(self, fanfic):
    """Download and store the story text of every chapter, one per request.

    Iterates chapters 1..N (N = number of threadmarks), fetches each
    chapter page, extracts 'div.storytext', and persists the pretty-printed
    HTML via thread.store_post, sleeping 1s between requests.
    """
    total = len(fanfic.threadmarks)
    for number in range(1, total + 1):
        url = "%s%d/" % (self.base_url, number)
        print("Fetch Chapter ...", number, url)
        page = http.get(url)
        soup = BeautifulSoup(page, "html5lib")
        body = soup.select_one('div.storytext')
        thread.store_post(fanfic.thread_key, number, body.prettify())
        time.sleep(1)  # throttle: be polite to the server
def fetch_metadata(self):
    """Scrape a XenForo thread page for its title and author.

    Returns a Fanfic record whose thread_type encodes the forum domain
    (e.g. "xenforo.<domain>").
    """
    page = http.get(self.base_url)
    soup = BeautifulSoup(page, "html.parser")
    domain, thread_id = self.extract_url('domain', 'id')
    heading = soup.find('h1')
    author_link = soup.select_one('p#pageDescription a.username')
    return Fanfic(thread_id=thread_id,
                  thread_type="xenforo.%s" % domain,
                  title=heading.text.strip(),
                  author=author_link.string.strip(),
                  thread_url=self.base_url)
def fetch_chapters(self, fanfic, posts_per_page=10):
    """Fetch all threadmarked posts via the XenForo reader view and store them.

    Pages through "<base_url>/reader", extracting each threadmarked post's
    <article> body and persisting it via thread.store_post.

    Args:
        fanfic: Fanfic record providing .threadmarks and .thread_key.
        posts_per_page: posts shown per reader page. Previously hard-coded
            to 10 (the XenForo default); now a backward-compatible parameter
            for forums configured differently.
    """
    threadmark_count = len(fanfic.threadmarks)
    reader_url = "%s/reader" % self.base_url
    # math.ceil already returns an int on Python 3; the int() wrapper was redundant.
    page_count = math.ceil(threadmark_count / posts_per_page)
    for page in range(1, page_count + 1):
        print("Fetch Page ...", page)
        content = http.get(reader_url, params={'page': page})
        doc = BeautifulSoup(content, "html.parser")
        for post_el in doc.select('li.message.hasThreadmark'):
            # The element id looks like "post-12345"; keep the numeric part.
            post_id = post_el['id'].split('-')[1]
            post_content = post_el.find('article')
            # Unwrap the blockquote so only the post body markup is stored.
            post_content.blockquote.unwrap()
            thread.store_post(fanfic.thread_key, post_id, post_content.prettify())
        time.sleep(1)  # throttle between page requests
def fetch_threadmarks(self, fanfic):
    """Scrape the thread's threadmark index into Threadmark records.

    Reads the threadmarks page and converts each list item's data-*
    attributes into a Threadmark (post id, title, word count, likes,
    author, publication timestamp).
    """
    # NOTE(review): no '/' is inserted here, unlike fetch_chapters'
    # "%s/reader" join — this implies base_url is expected to end with a
    # trailing slash; confirm which convention the callers actually use.
    index_url = "%sthreadmarks" % self.base_url
    content = http.get(index_url)
    doc = BeautifulSoup(content, "html.parser")
    items = doc.select('div.threadmarkList ol li.threadmarkListItem')
    result = []
    for item in items:
        link = item.a
        # data-previewurl looks like "posts/<id>/..."; index 1 is the post id.
        post_id = int(link['data-previewurl'].split('/')[1])
        result.append(Threadmark(
            post_id=post_id,
            fanfic_id=fanfic.id,
            title=link.string.strip(),
            words=int(item['data-words']),
            likes=int(item['data-likes']),
            author=item['data-content-author'],
            published=datetime.fromtimestamp(int(item['data-content-date'])),
        ))
    return result