def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    contentElements = soup.select("p")
    if len(contentElements) < 5:
        logging.error("Chapter page doesn't conform to expected format.")
        return None

    # The first two and the last three paragraphs are site boilerplate, not
    # story content - remove them from the tree.
    contentElements[0].decompose()
    contentElements[1].decompose()
    contentElements[-1].decompose()
    contentElements[-2].decompose()
    contentElements[-3].decompose()

    return Chapter(title=None, content=Stringify(soup))
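# A minimal, self-contained sketch (assuming standard bs4 semantics) of why the
# index-based decompose() calls above are safe: decompose() removes a tag from
# the tree, but not from the already-built ResultSet, so the list indices keep
# referring to the original paragraph order.
from bs4 import BeautifulSoup

exampleSoup = BeautifulSoup("<p>a</p><p>b</p><p>c</p>", "html.parser")
exampleParagraphs = exampleSoup.select("p")

exampleParagraphs[0].decompose()

assert 3 == len(exampleParagraphs)             # The ResultSet still holds all three tags...
assert "<p>b</p><p>c</p>" == str(exampleSoup)  # ...but the tree no longer contains the first one.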
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    # Locate relevant page elements.
    titleElement = soup.select_one("div#chapter-outer > div.caption > div > h4")
    # No error-checking here. Not sure if every chapter has to have a title on WW.

    contentElement = soup.select_one("div#chapter-content")
    if not contentElement:
        logging.error("Content element not found.")
        return None

    # Return.
    return Chapter(
        titleElement.get_text().strip() if titleElement else "",
        Stringify(contentElement.encode_contents())
    )
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    # Extract the title.
    title = None

    titleElement = soup.select_one("p.highlighted-image__title > a")
    if titleElement:
        title = titleElement.get_text().strip()

    # Extract the content.
    contentElement = soup.select_one("div.storytext-container")
    if not contentElement:
        logging.error("Couldn't find the content element.")
        return None

    # Return.
    return Chapter(
        title=title,
        content=Stringify(contentElement.encode_contents())
    )
def ExtractChapter(self, index: int) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param index The index of the chapter to be extracted.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    if 1 == self.Story.Metadata.ChapterCount:

        titleElement = None

        contentElement = self._storySoup.select_one("div#chapters div.userstuff")
        if not contentElement:
            logging.error("Content element not found.")
            return None

        # Remove the hidden "landmark" heading preceding the work's text.
        if (landmarkElement := contentElement.select_one("h3#work")):
            landmarkElement.decompose()

        return Chapter(
            title=titleElement.get_text().strip() if titleElement else None,
            content=Stringify(contentElement.encode_contents())
        )
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    # Locate relevant page elements.
    titleElement = soup.select_one("h2#quizSubtitle")
    if not titleElement:
        logging.error("Title element not found.")
        return None

    contentElement = soup.select_one("#rescontent")
    if not contentElement:
        logging.error("Content element not found.")
        return None

    # Return.
    return Chapter(
        titleElement.get_text().strip(),
        Stringify(contentElement.encode_contents())
    )
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    # Extract the content.
    contentElement = soup.select_one("div#story")
    if not contentElement:
        logging.error("Couldn't find the content element.")
        return None

    # Return. The title defaults to None when the URL is unknown.
    return Chapter(
        title=self._chapterTitles.get(URL),
        content=Stringify(contentElement.encode_contents())
    )
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    # Read the chapter.
    chapterText = self._webSession.Get(URL, textEncoding="ascii")
    if not chapterText:
        logging.error("Failed to download a chapter.")
        return None

    chapterText = chapterText.splitlines()
    if len(chapterText) < 4:
        logging.error("Invalid chapter format.")
        return None

    # Skip the header lines preceding the story text.
    chapterText = chapterText[3:]

    # Format the content: blank lines separate paragraphs.
    chapterCode = ""
    currentParagraphCode = ""

    for line in chapterText:
        if not line:
            chapterCode += f"<p>{currentParagraphCode}</p>"
            currentParagraphCode = ""
        else:
            currentParagraphCode += f" {line.strip()}"

    # Flush the last paragraph, in case the text doesn't end with a blank line.
    if currentParagraphCode:
        chapterCode += f"<p>{currentParagraphCode}</p>"

    # Return.
    return Chapter(content=chapterCode)
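# A minimal sketch of the blank-line reflow performed above (hypothetical
# input, for illustration only):
#
#     lines  = ["First line,", "continued.", "", "Second paragraph.", ""]
#     result = "<p> First line, continued.</p><p> Second paragraph.</p>"
#
# A paragraph is emitted only when a blank line follows it, which is why the
# loop flushes any remaining text once it finishes.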
def ExtractChapter(self, index: int) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param index The index of the chapter to be extracted.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    if (not self._chapters) or (index < 1) or (index > len(self._chapters)):
        return None

    return Chapter(content="".join(self._chapters[index - 1]))
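# Chapter indices are 1-based throughout, mirroring chapter numbering on the
# sites themselves - hence the "index - 1" above. For illustration only, with
# hypothetical data:
#
#     self._chapters = [["<p>A</p>"], ["<p>B</p>"]]
#     ExtractChapter(1)  ->  Chapter(content="<p>A</p>")
#     ExtractChapter(3)  ->  None (out of range)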
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    rowElements = soup.select("div#contentdata > table > tr")
    if (not rowElements) or (len(rowElements) < 3):
        logging.error("Chapter page doesn't conform to expected format.")
        return None

    return Chapter(
        title=None,
        content=Stringify(rowElements[2].encode_contents())
    )
def _ProcessURL(self, URL: str) -> Optional[Story]:

    ##
    #
    # Processes a URL, in text mode.
    #
    # @param URL The URL to be processed.
    #
    # @return The Story object if the URL has been processed successfully, **None** otherwise.
    #
    ##

    # Locate a working extractor.
    self._interface.Process("Creating the extractor...", section=True)

    extractor = CreateExtractor(URL)
    if not extractor:
        logging.error("No matching extractor found.")
        return None

    self._interface.Comment(f'Extractor created: "{type(extractor).__name__}".')

    # Authenticate the user (if supported by the extractor).
    if self._arguments.Authenticate and extractor.SupportsAuthentication():

        self._interface.Process("Logging in...", section=True)

        authenticationResult = extractor.Authenticate(self._interface)
        if Extractor.AuthenticationResult.FAILURE == authenticationResult:
            self._interface.Error("Failed to authenticate.")
        elif Extractor.AuthenticationResult.ABANDONED == authenticationResult:
            self._interface.Comment("Proceeding without logging in...")
        else:
            self._interface.Comment("Authenticated successfully.")

    # Scan the story.
    self._interface.Process("Scanning the story...", section=True)

    if not extractor.ScanStory():
        logging.error("Failed to scan the story.")
        return None

    self._PrintMetadata(extractor.Story)

    # Check whether the output files already exist.
    outputFilePaths = self._GetOutputPaths(self._arguments.Output, extractor.Story)

    if (not self._arguments.Force) and all(x.is_file() for x in outputFilePaths.values()):
        self._interface.Comment("This story has been downloaded already.", section=True)
        return extractor.Story
    elif self._arguments.Force:
        for filePath in outputFilePaths.values():
            if filePath.is_file():
                filePath.unlink()

    # Extract content.
    self._interface.Process("Extracting content...", section=True)

    for index in range(1, extractor.Story.Metadata.ChapterCount + 1):

        # Generate cache identifiers.
        cacheOwnerName = extractor.Story.Metadata.URL
        cacheTitleName = f"{index}-Title"
        cacheContentName = f"{index}-Content"

        # Retrieve chapter data, either from the cache or by downloading it.
        retrievedFromCache = False

        chapter = Chapter(
            title=Stringify(self._cache.RetrieveItem(cacheOwnerName, cacheTitleName)),
            content=Stringify(self._cache.RetrieveItem(cacheOwnerName, cacheContentName))
        )

        if chapter:
            retrievedFromCache = True
        else:
            chapter = extractor.ExtractChapter(index)

        if not chapter:
            if (1 != index) and (extractor.Story.Metadata.ChapterCount != index):
                logging.error("Failed to extract story content.")
                return None
            else:
                self._interface.Error(
                    "Failed to extract the last chapter - it doesn't seem to exist."
                )
                continue

        extractor.Story.Chapters.append(chapter)

        # Add the chapter to the cache.
        if not retrievedFromCache:
            self._cache.AddItem(cacheOwnerName, cacheTitleName, chapter.Title)
            self._cache.AddItem(cacheOwnerName, cacheContentName, chapter.Content)

        # Notify the user, then sleep for a while.
        self._interface.ProgressBar(
            index,
            extractor.Story.Metadata.ChapterCount,
            Configuration.ProgressBarLength,
            f"# Extracted chapter {index}/{extractor.Story.Metadata.ChapterCount}",
            True
        )
        if extractor.Story.Metadata.ChapterCount == index:
            self._interface.EmptyLine()

        if not retrievedFromCache and extractor.RequiresBreaksBetweenRequests():
            sleep(Configuration.PostChapterSleepTime)

    # Locate and download images.
    if self._arguments.Images:

        self._interface.Process("Downloading images...", section=True)

        # Locate the images.
        for chapter in extractor.Story.Chapters:
            extractor.Story.Images.extend(FindImagesInCode(chapter.Content))

        storySiteURL = GetSiteURL(extractor.Story.Metadata.URL)
        for image in extractor.Story.Images:
            image.URL = MakeURLAbsolute(image.URL, storySiteURL)

        self._interface.Comment(f"Found {len(extractor.Story.Images)} image(s).")

        # Download them.
        if extractor.Story.Images:

            imageCount = len(extractor.Story.Images)
            downloadedImageCount = 0
            previousImageFailedToDownload = False

            for index, image in enumerate(extractor.Story.Images, start=1):

                # Retrieve image data, either from the cache or by downloading it.
                retrievedFromCache = False

                imageData = self._cache.RetrieveItem(extractor.Story.Metadata.URL, image.URL)

                if not image.CreateFromData(imageData, Configuration.MaximumImageSideLength):
                    imageData = extractor.ExtractMedia(image.URL)
                    if imageData:
                        image.CreateFromData(imageData, Configuration.MaximumImageSideLength)
                else:
                    retrievedFromCache = True

                if image:

                    if not retrievedFromCache:
                        self._cache.AddItem(extractor.Story.Metadata.URL, image.URL, image.Data)

                    self._interface.ProgressBar(
                        index,
                        imageCount,
                        Configuration.ProgressBarLength,
                        f"# Downloaded image {index}/{imageCount}",
                        True
                    )
                    if imageCount == index:
                        print()

                    downloadedImageCount += 1
                    previousImageFailedToDownload = False

                else:

                    if (index > 1) and (not previousImageFailedToDownload):
                        print()

                    errorMessage = (
                        f'Failed to download image {index}/{imageCount}: "{image.URL}".'
                        if not imageData else
                        f'Failed to process/re-encode image {index}/{imageCount}: "{image.URL}".'
                    )
                    self._interface.Error(errorMessage)

                    previousImageFailedToDownload = True

            self._interface.Comment(
                f"Successfully downloaded {downloadedImageCount}/{imageCount} image(s)."
            )

    # Process content.
    self._interface.Process("Processing content...", section=True)

    extractor.Story.Process()

    for index, chapter in enumerate(extractor.Story.Chapters, start=1):

        # Store original content.
        if self._arguments.Debug:
            fileName = GetSanitizedFileName(f"{index} - Original.html")
            fileSubdirectoryName = GetSanitizedFileName(extractor.Story.Metadata.Title)
            WriteTextFile(
                Configuration.DebugDirectoryPath / fileSubdirectoryName / fileName,
                chapter.Content
            )

        # The sanitizer is used twice - once before any other processing, once after every
        # other processor. The first time is required to clean up the story (remove empty
        # tags and tag trees, for example), the second to guarantee that the story is
        # actually sanitized.
        chapter.Content = SanitizerProcessor().Process(chapter.Content)
        chapter.Content = TypographyProcessor().Process(chapter.Content)
        chapter.Content = SanitizerProcessor().Process(chapter.Content)

        # Store processed content.
        if self._arguments.Debug:
            fileName = GetSanitizedFileName(f"{index} - Processed.html")
            fileSubdirectoryName = GetSanitizedFileName(extractor.Story.Metadata.Title)
            WriteTextFile(
                Configuration.DebugDirectoryPath / fileSubdirectoryName / fileName,
                chapter.Content
            )

    if not extractor.Story.Metadata.WordCount:
        extractor.Story.Metadata.WordCount = extractor.Story.CalculateWordCount()

    self._interface.Comment("Content processed.")

    # Return.
    return extractor.Story
logging.error("Content element not found.") return None if (unwantedElement := contentElement.select_one("span.rt-reading-time")): unwantedElement.replaceWith("") if (unwantedElement := contentElement.select_one("div.wpcm-subscribe")): unwantedElement.replaceWith("") if (unwantedElement := contentElement.select_one("rating-form")): unwantedElement.replaceWith("") return Chapter(title=SeparateSubtitle( self._CleanStoryTitle(titleElement.get_text().strip())), content=Stringify(contentElement.encode_contents())) def _FindAllStoriesByAuthor(self, authorName: str): # Download author's page. authorsPageURL = f"https://najlepszaerotyka.com.pl/author/{authorName}/" soup = self._webSession.GetSoup(authorsPageURL) if not soup: logging.error("Failed to download page: \"{authorsPageURL\".") return None # Get the number of subpages.
            logging.error(f'Failed to download page: "{pageURL}".')
            return None

        contentElement = (
            soup.select_one("div.b-story-body-x > div")
            or soup.select_one("div.panel.article")
        )
        if not contentElement:
            logging.error("Story content element not found.")
            return None

        content += "<br/><br/>" + Stringify(contentElement.encode_contents())

    # Return.
    return Chapter(title=None, content=content)

def _GetNormalizedStoryURL(self, URL: str) -> str:

    ##
    #
    # Returns a normalized story URL, i.e. one that can be used for anything.
    #
    # @param URL Input URL (given by the user).
    #
    # @return Normalized URL.
    #
    ##

    if not URL:
        return URL
def _InternallyExtractChapter(
    self,
    URL: str,
    soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
    #
    ##

    # Define usual story endings.
    USUAL_ENDINGS = [
        "~~~",
        "~ ~ ~ ",
        "the end",
        "end",
    ]

    # Locate and cut the end.
    text = self._storyText.splitlines()

    separatorLineIndices = []
    endLineIndices = []

    for index, line in enumerate(text):

        strippedLine = line.strip()
        if strippedLine.startswith("***") or strippedLine.startswith("------"):
            separatorLineIndices.append(index)

        lowercaseLine = strippedLine.lower()
        for ending in USUAL_ENDINGS:
            if lowercaseLine.startswith(ending):
                endLineIndices.append(index)
                break

    # The story body starts after the second separator; guard against stories
    # with fewer separators than expected.
    firstLineIndex = separatorLineIndices[1] if len(separatorLineIndices) > 1 else -1
    lastLineIndex = endLineIndices[-1] if endLineIndices else -1

    if -1 == firstLineIndex:
        logging.error("Invalid story content format.")
        return None

    if -1 == lastLineIndex:
        text = text[firstLineIndex + 1:]
    else:
        text = text[firstLineIndex + 1:lastLineIndex]

    # Format the content: blank lines separate paragraphs.
    chapterCode = ""
    currentParagraphCode = ""

    for line in text:
        if not line:
            chapterCode += f"<p>{currentParagraphCode}</p>"
            currentParagraphCode = ""
        else:
            currentParagraphCode += f" {line.strip()}"

    # Flush the last paragraph, in case the text doesn't end with a blank line.
    if currentParagraphCode:
        chapterCode += f"<p>{currentParagraphCode}</p>"

    # Return.
    return Chapter(content=chapterCode)
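# A sketch of the plain-text layout the function above assumes (hypothetical,
# for illustration only):
#
#     Title and header lines...
#     ***                 <- first separator
#     Summary/notes...
#     ***                 <- second separator; the story body starts below it
#     Story text...
#
#     THE END             <- matched (case-insensitively) against USUAL_ENDINGS
#
# Everything before the second separator and everything from the ending line
# onwards is cut; the remaining lines are reflowed into HTML paragraphs.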
f"Trying to extract chapter {index}. " f"Only {len(chapterElements)} chapter(s) located. " f"The story supposedly has {self.Story.Metadata.ChapterCount} chapter(s)." ) return None currentChapterElement = chapterElements[index - 1] titleElement = currentChapterElement.select_one("h3.title") contentElement = currentChapterElement.select_one("div.userstuff") if (landmarkElement := contentElement.select_one("h3#work")): landmarkElement.decompose() return Chapter(title=titleElement.get_text().strip() if titleElement else None, content=Stringify(contentElement.encode_contents())) def _ScanWorks(self, URL: str) -> Optional[List[str]]: ## # # Scans a list of works: generates the list of story URLs. # # @param URL The URL. # # @return **None** when the scan fails, a list of story URLs when it doesn't fail. # ## # Check the arguments.
    if not contentElement:
        logging.error("Content element not found.")
        return None

    if (element := contentElement.select_one("div#storyHeader")):
        element.decompose()

    if (element := contentElement.select_one("div#authorNotes")):
        element.decompose()

    for element in contentElement.select("form"):
        element.decompose()

    # Return.
    return Chapter(content=Stringify(contentElement.encode_contents()))

@staticmethod
def _GetStoryID(URL: str) -> Optional[str]:

    ##
    #
    # Retrieves story ID from story URL.
    #
    # @param URL The URL of the story.
    #
    # @return The ID of the story. Optionally **None**.
    #
    ##

    if not URL:
    if (selectedChapterElement := soup.find("option", {"selected": True})):
        title = selectedChapterElement.text.strip()
        if title and (titleMatch := re.search(r"\d+\. (.*)", title)):
            title = titleMatch.group(1)

    # Read the content.
    storyTextElement = soup.find(id="storytext")
    if not storyTextElement:
        logging.error("Story text element not found.")
        return None

    # Create the Chapter and return it.
    return Chapter(
        title=title,
        content=Stringify(storyTextElement.encode_contents())
    )

@staticmethod
def _GetStoryID(URL: str) -> Optional[str]:

    if not URL:
        return None

    storyIDMatch = re.search(r"/s/(\d+)/", URL)
    if not storyIDMatch:
        return None

    return storyIDMatch.group(1)

@staticmethod
def _ReformatDate(date: str) -> Optional[str]:
class ExtractorHentaiFoundry(Extractor):

    def __init__(self) -> None:

        ##
        #
        # The constructor.
        #
        ##

        super().__init__()

    def GetSupportedHostnames(self) -> List[str]:

        ##
        #
        # Returns a list of hostnames supposed to be supported by the extractor.
        #
        # @return A list of supported hostnames.
        #
        ##

        return ["hentai-foundry.com"]

    def ScanChannel(self, URL: str) -> Optional[List[str]]:

        ##
        #
        # Scans the channel: generates the list of story URLs.
        #
        # @return **None** when the scan fails, a list of story URLs when it doesn't fail.
        #
        ##

        if (not URL) or (GetHostname(URL) not in self.GetSupportedHostnames()):
            return None

        # A URL pointing to a specific story isn't a channel.
        usernameStoryIDMatch = re.search(r"/user/([a-zA-Z0-9_]+)/(\d+)", URL)
        if usernameStoryIDMatch:
            return None

        usernameMatch = re.search(r"/user/([a-zA-Z0-9_]+)", URL)
        if not usernameMatch:
            return None
        username = usernameMatch.group(1)

        normalizedURL = f"http://www.hentai-foundry.com/stories/user/{username}/"

        pageSoup = self._webSession.GetSoup(self._GetAdultView(normalizedURL))
        if not pageSoup:
            return None

        # Read the total number of stories from the gallery header.
        pageCountDescriptionElement = pageSoup.select_one(".galleryHeader > .summary")
        if not pageCountDescriptionElement:
            logging.error("Page count description element not found.")
            return None
        pageCountDescription = pageCountDescriptionElement.get_text().strip()

        pageCountDescriptionMatch = re.search(
            r"Displaying (\d+)-(\d+) of (\d+) results",
            pageCountDescription
        )
        if not pageCountDescriptionMatch:
            logging.error("Failed to retrieve page count of the Stories tab.")
            return None

        storiesPerPage = int(pageCountDescriptionMatch.group(2))
        storiesInTotal = int(pageCountDescriptionMatch.group(3))
        if not storiesPerPage:
            return None

        pageCount = ceil(storiesInTotal / storiesPerPage)

        # Visit every page of the gallery and collect story links.
        storyURLs = []
        for pageIndex in range(1, pageCount + 1):

            pageURL = self._GetAdultView(
                f"http://www.hentai-foundry.com/stories/user/{username}?page={pageIndex}"
            )

            pageSoup = self._webSession.GetSoup(pageURL)
            if not pageSoup:
                return None

            storyLinkElements = pageSoup.select(".items > .storyRow > .titlebar > a")
            for linkElement in storyLinkElements:
                if not linkElement.has_attr("href"):
                    continue
                storyURLs.append(self._baseURL + linkElement["href"])

        return storyURLs

    def _InternallyScanStory(self, URL: str, soup: Optional[BeautifulSoup]) -> bool:

        ##
        #
        # Scans the story: generates the list of chapter URLs and retrieves the
        # metadata.
        #
        # @param URL The URL of the story.
        # @param soup The tag soup.
        #
        # @return **False** when the scan fails, **True** when it doesn't fail.
        #
        ##

        # Locate metadata.
        titleElement = soup.select_one(".titlebar a")
        if not titleElement:
            logging.error("Title element not found.")
            return False

        authorElement = soup.select_one(".storyInfo > .col1 > a")
        if not authorElement:
            logging.error("Author element not found.")
            return False

        datesElements = soup.select(".storyInfo > .col2 > .indent")
        if (not datesElements) or (len(datesElements) < 2):
            logging.error("Dates elements not found.")
            return False
        datePublishedElement = datesElements[0]
        dateUpdatedElement = datesElements[1]

        summaryElement = soup.select_one(".storyDescript")
        if not summaryElement:
            logging.error("Summary element not found.")
            return False

        chapterCountWordCountElement = soup.select_one(".storyInfo > .col3")
        if not chapterCountWordCountElement:
            logging.error("Chapter/word count elements not found.")
            return False

        # Extract and save metadata.
        self.Story.Metadata.Title = titleElement.get_text().strip()
        self.Story.Metadata.Author = authorElement.get_text().strip()

        rawDatePublished = datePublishedElement.get_text().strip()
        rawDateUpdated = dateUpdatedElement.get_text().strip()
        self.Story.Metadata.DatePublished = self._ReformatDate(rawDatePublished)
        self.Story.Metadata.DateUpdated = self._ReformatDate(rawDateUpdated)

        chapterCountWordCountDescription = StripHTML(
            chapterCountWordCountElement.get_text().strip()
        )

        chapterCountMatch = re.search(
            r"Chapters:\s+(\d+)",
            chapterCountWordCountDescription
        )
        if not chapterCountMatch:
            logging.error("Chapter count not found.")
            return False

        wordCountMatch = re.search(
            r"Words:\s+([0-9,]+)",
            chapterCountWordCountDescription
        )
        if not wordCountMatch:
            logging.error("Word count not found.")
            return False

        self.Story.Metadata.ChapterCount = int(chapterCountMatch.group(1))
        self.Story.Metadata.WordCount = self._ReadWordCount(wordCountMatch.group(1))

        self.Story.Metadata.Summary = StripHTML(summaryElement.get_text().strip())

        # Retrieve chapter URLs.
        chapterLinkElements = soup.select(".boxbody > p > a")
        if not chapterLinkElements:
            logging.error("No chapter links found.")
            return False

        for linkElement in chapterLinkElements:
            if not linkElement.has_attr("href"):
                continue
            self._chapterURLs.append(self._baseURL + linkElement["href"])

        # Return.
        return True

    def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
    ) -> Optional[Chapter]:

        ##
        #
        # Extracts specific chapter.
        #
        # @param URL The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The Chapter object if the chapter is extracted correctly, **None** otherwise.
        #
        ##

        # Read the title.
        chapterTitle = None
        if (titleElement := soup.select_one("#viewChapter > .boxheader")):
            chapterTitle = titleElement.get_text().strip()

        # Read the content.
        storyTextElement = soup.select_one("#viewChapter > .boxbody")
        if not storyTextElement:
            logging.error("Story text element not found.")
            return None

        return Chapter(
            title=chapterTitle,
            content=Stringify(storyTextElement.encode_contents())
        )
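# A minimal sketch of how an extractor like the one above is driven, based on
# the _ProcessURL flow shown earlier in this section (imports omitted; the
# story URL is hypothetical, for illustration only):

extractor = CreateExtractor("http://www.hentai-foundry.com/stories/user/SomeUser/1/A-Story")

if extractor and extractor.ScanStory():
    for index in range(1, extractor.Story.Metadata.ChapterCount + 1):
        # ExtractChapter(...) returns None on failure.
        if (chapter := extractor.ExtractChapter(index)):
            extractor.Story.Chapters.append(chapter)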