def importFic(fdata):
    """Insert a fic described by the dict ``fdata``, with its tags and chapters.

    ``fdata`` is copied and passed through ``inflateObject`` with the
    module-level ``ficImportRename``; every resulting field is written
    straight into the new fic's ``__dict__``.  Fandoms, characters, genres
    and tags are then attached, and one ``FicChapter`` is inserted per key of
    ``fdata['chapters']`` in ascending numeric order.  Chapter html is loaded
    from ``./content/<type>/<localId>/<cid>/content.html`` when that file
    exists.  Progress is printed to stdout.
    """
    inflated = inflateObject(fdata.copy(), ficImportRename)

    fic = Fic.new()
    for key in inflated:
        print(f'setting "{key}" to "{inflated[key]}"')
        fic.__dict__[key] = inflated[key]

    # dates arrive in a parseable form; normalise both to unix timestamps
    fic.published = util.parseDateAsUnix(fic.published, int(time.time()))
    fic.updated = util.parseDateAsUnix(fic.updated, int(time.time()))
    print(f'setting "published" to "{fic.published}"')
    print(f'setting "updated" to "{fic.updated}"')

    print(f'adding "{fic.title}" ({fic.type}/{fic.localId})')
    fic.insert()

    for fandomName in fdata['fandoms']:
        print(f' adding fandom "{fandomName}"')
        fic.add(Fandom.define(fandomName))

    for charInfo in fdata['characters']:
        print(
            ' adding character "{}" from fandom "{}"'.format(
                charInfo['name'], charInfo['fandom']
            )
        )
        owningFandom = Fandom.define(charInfo['fandom'])
        fic.add(Character.define(owningFandom, charInfo['name']))

    for genreName in fdata['genres']:
        print(f' adding genre "{genreName}"')
        fic.add(Genre.define(genreName))

    for tagName in fdata['tags']:
        print(f' adding tag "{tagName}"')
        fic.add(Tag.define(tagName))

    # chapter keys are strings; walk them in numeric order
    for chapterNo in sorted(int(key) for key in fdata['chapters']):
        print(f' adding chapter {chapterNo}')
        exported = fdata['chapters'][str(chapterNo)]

        chapter = FicChapter.new()
        chapter.fic = fic
        chapter.ficId = fic.id
        chapter.chapterId = chapterNo
        for key in exported:
            chapter.__dict__[key] = exported[key]

        contentPath = f'./content/{fic.type}/{fic.localId}/{chapterNo}/content.html'
        if os.path.isfile(contentPath):
            with open(contentPath, 'r') as contentFile:
                body = contentFile.read()
            print(f' has content: {len(body)}')
            chapter.setHtml(body)

        chapter.insert()
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate ``fic`` from the html of an AO3 work page and return it.

    Reads title, summary, chapter/word/kudos counts, published/updated
    dates, status and author from the page, upserts the fic and its chapters
    when there is more than one chapter, maps AO3 fandom tags onto our
    fandom names and, when exactly one fandom results, defines characters
    from the relationship tags.

    Raises ``Exception`` when the title cannot be located, the status label
    is unrecognised, the chapter dropdown disagrees with the chapter count,
    or a relationship tag has more than 8 parts.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')

    def ddText(cssClass: str) -> str:
        # space-joined contents of the <dd> carrying exactly this class
        return ' '.join(soup.find('dd', {'class': cssClass}).contents).strip()

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    headings = soup.findAll('h2', {'class': 'title heading'})
    if len(headings) != 1:
        raise Exception('unable to find ao3 title {}'.format(fic.url))
    fic.title = headings[0].get_text().strip()

    summaries = soup.findAll('div', {'class': 'summary module'})
    if len(summaries) != 1:
        # fall back to the summary inside a lone preface group
        prefaces = soup.findAll('div', {'class': 'preface group'})
        if len(prefaces) == 1:
            summaries = prefaces[0].findAll('div', {'class': 'summary module'})

    if len(summaries) == 1:
        quote = summaries[0].find('blockquote')
        fic.description = quote.decode_contents(formatter='html').strip()
    elif fic.description is None:
        fic.description = "{no summary}"
        # raise Exception('unable to find ao3 summary {}'.format(fic.localId))

    fic.ageRating = '<unkown>'

    # TODO: error handling
    chapterParts = ddText('chapters').split('/')  # "<written>/<total or ?>"
    completedChapters = int(chapterParts[0])
    totalChapters = None if chapterParts[1] == '?' else int(chapterParts[1])
    fic.chapterCount = completedChapters

    fic.wordCount = int(ddText('words'))

    fic.reviewCount = 0

    fic.favoriteCount = 0
    kudosDd = soup.find('dd', {'class': 'kudos'})
    if kudosDd is not None:
        fic.favoriteCount = int(' '.join(kudosDd.contents).strip())

    fic.followCount = 0

    fic.published = OilTimestamp(
        util.parseDateAsUnix(ddText('published'), fic.fetched)
    )

    if fic.updated is None:
        fic.updated = fic.published
    if fic.updated is not None:
        fic.updated = OilTimestamp(
            util.parseDateAsUnix(fic.updated, fic.fetched)
        )

    fic.ficStatus = FicStatus.ongoing  # TODO chapter/chapters?
    if totalChapters is None or completedChapters < totalChapters:
        fic.ficStatus = FicStatus.ongoing

    statusDt = soup.find('dt', {'class': 'status'})
    if statusDt is not None:
        statusLabel = statusDt.contents[0]
        if statusLabel == 'Completed:':
            fic.ficStatus = FicStatus.complete
        elif statusLabel == 'Updated:':
            fic.ficStatus = FicStatus.ongoing
        else:
            raise Exception('unkown status: {}'.format(statusDt.contents[0]))
        # either label is followed by the date of the latest change
        fic.updated = OilTimestamp(
            util.parseDateAsUnix(ddText('status'), fic.fetched)
        )

    byline = soup.find('h3', {'class': 'byline heading'})
    authorLink = byline.find('a')
    if authorLink is not None:
        authorUrl = authorLink.get('href')
        author = ' '.join(byline.find('a').contents)
        authorId = author  # map pseudo to real?
        self.setAuthor(fic, author, authorUrl, authorId)
    elif fic.authorId is not None and len(fic.getAuthorName()) > 0:
        pass  # updated author to anon, don't make changes
    else:
        # first loaded after it was already set to anonymous
        self.setAuthor(fic, 'Anonymous', '', 'Anonymous')

    if fic.chapterCount > 1:
        fic.upsert()
        chapterOptions = soup.find(id='selected_id').findAll('option')
        # note: ao3 sometimes says there are less chapters than there really
        # are, possibly due to caching on their end. We just ensure there's
        # _at least_ chapterCount chapters, then fetch whatever the dropdown
        # tells us to
        if len(chapterOptions) > fic.chapterCount:
            fic.chapterCount = len(chapterOptions)
            fic.upsert()
        if len(chapterOptions) != fic.chapterCount:
            raise Exception('mismatching localChapterId count?')

        for cid in range(1, fic.chapterCount + 1):
            option = chapterOptions[cid - 1]
            chap = fic.chapter(cid)
            chap.url = '{}{}/chapters/{}?view_adult=true'.format(
                self.baseUrl, fic.localId, option.get('value')
            )
            chap.localChapterId = option.get('value')
            chap.title = option.getText().strip()
            if chap.title is not None:
                chap.title = util.cleanChapterTitle(chap.title, cid)
            chap.upsert()

    # TODO: this seriously needs reworked
    # (lower-cased ao3 fandom tags) -> (our fandom names); an empty target
    # tuple means the tag is recognised but deliberately ignored
    aliasGroups = [
        (('sherlock - fandom', 'sherlock (tv)',
          'sherlock holmes & related fandoms',
          'sherlock holmes - arthur conan doyle',
          'sherlock holmes (downey films)'), ('Sherlock Holmes',)),
        (('furry (fandom)', 'harry - fandom'), ()),  # skip
        (('fleurmione - fandom',), ()),  # skip
        (('skyfall (2012) - fandom',), ('James Bond',)),
        (('orphan black (tv)',), ('Orphan Black',)),
        (('naruto', 'naruto shippuden', 'naruto shippuuden - fandom'),
         ('Naruto',)),
        (('naruto/harry potter',), ('Naruto', 'Harry Potter')),
        (('bleach',), ('Bleach',)),
        (('iron man (movies)', 'iron man - all media types',
          'iron man (comic)', 'iron man - fandom', 'iron man (comics)'),
         ('Iron Man',)),
        (('the avengers (marvel) - all media types',
          'the avengers (marvel movies)', 'the avengers - ambiguous fandom',
          'the avengers (2012)', 'the avengers',
          'avengers (marvel) - all media types',
          'marvel avengers movies universe', 'avengers'), ('Avengers',)),
        (('marvel 616',), ('Marvel', 'Marvel 616')),
        (('thor (movies)', 'thor - all media types'), ('Thor',)),
        (('captain america (movies)', 'captain america - all media types',
          'captain america (comics)'), ('Captain America',)),
        (('avatar: the last airbender', 'avatar: legend of korra',
          'avatar the last airbender - fandom'), ('Avatar',)),
        (('original work',), ('Original Work',)),
        (('stargate atlantis',), ('Stargate Atlantis',)),
        (('stargate sg-1',), ('Stargate SG-1',)),
        (('stargate - all series',), ('Stargate Atlantis', 'Stargate SG-1')),
        (('agents of s.h.i.e.l.d. (tv)',), ('Avengers',)),
        (('supernatural',), ('Supernatural',)),
        (('teen wolf (tv)',), ('Teen Wolf',)),
        (('grimm (tv)',), ('Grimm',)),
        (('the amazing spider-man (movies - webb)',
          'spider-man - all media types', 'spider-man: homecoming (2017)'),
         ('Spiderman',)),
        (('x-men - all media types', 'x-men (movieverse)',
          'x-men (comicverse)'), ('X-Men',)),
        (('lord of the rings - j. r. r. tolkien',
          'the lord of the rings - j. r. r. tolkien'),
         ('Lord of the Rings',)),
        (('crisis core: final fantasy vii',
          'compilation of final fantasy vii', 'final fantasy vii'),
         ('Final Fantasy VII', 'Final Fantasy')),
        (('sen to chihiro no kamikakushi | spirited away',),
         ('Spirited Away',)),
        (("howl no ugoku shiro | howl's moving castle",),
         ("Howl's Moving Castle",)),
        (('rise of the guardians (2012)',), ('Rise of the Guardians',)),
        (('doctor who', 'doctor who (2005)', 'doctor who & related fandoms'),
         ('Doctor Who',)),
        (('daredevil (tv)', 'daredevil (comics)'), ('DareDevil',)),
        (('labyrinth (1986)',), ('Labyrinth',)),
        (('gravity falls',), ('Gravity Falls',)),
        (('once upon a time (tv)',), ('Once Upon a Time',)),
        (('doctor strange (comics)',), ('Doctor Strange',)),
        (('the sentinel',), ('The Sentinel',)),
        (('teen titans (animated series)',), ('Teen Titans',)),
        (('dcu', 'dcu animated', 'dcu (comics)', 'dc extended universe',
          'dc animated universe'), ('DC',)),
        (('vampire hunter d',), ('Vampire Hunter D',)),
        (('homestuck',), ('Homestuck',)),
        (('one piece',), ('One Piece',)),
        (('batman (movies - nolan)',), ('Batman',)),
        (('die hard (movies)',), ('Die Hard',)),
        (('discworld - terry pratchett',), ('Discworld',)),
        (('gossip girl',), ('Gossip Girl',)),
        (('a song of ice and fire - george r. r. martin',
          'a song of ice and fire & related fandoms'),
         ('A Song of Ice and Fire',)),
        (('supergirl (tv 2015)',), ('Supergirl',)),
        (('merlin (tv)',), ('Merlin',)),
        (('star trek',), ('Star Trek',)),
        (('steven universe (cartoon)',), ('Steven Universe',)),
        (('hellsing',), ('Hellsing',)),
        (('the breaker',), ('The Breaker',)),
        (('smallville',), ('Smallville',)),
        (('베리타스 | veritas (manhwa)',), ('Veritas (manhwa)',)),
        (('guardians of childhood - william joyce',),
         ('Guardians of Childhood',)),
        (('person of interest (tv)',), ('Person of Interest',)),
        (('james bond (craig movies)',), ('James Bond',)),
        (('the bourne legacy (2012)',), ('Jason Bourne',)),
        (('numb3rs',), ('Numb3rs',)),
        (('temeraire - naomi novik',), ('Temeraire',)),
        (('twilight series - stephenie meyer',), ('Twilight',)),
        (('dungeons and dragons - fandom',), ('Dungeons and Dragons',)),
        (('american horror story', 'american horror story: cult'),
         ('American Horror Story',)),
        (('worm (web serial novel)', 'worm - wildbow',
          'parahumans series - wildbow', 'worm (web serial) | wildbow',
          'worm - fandom', 'parahumans - fandom', 'worm (parahumans)',
          'worm (web serial)', 'worm | parahumans', 'worm (web novel)'),
         ('Worm',)),
        (('toaru kagaku no railgun | a certain scientific railgun',),
         ('A Certain Scientific Railgun',)),
        (('toaru majutsu no index | a certain magical index',),
         ('A Certain Magical Index',)),
        (('cthulhu mythos - h. p. lovecraft',), ('Cthulhu',)),
        (('transformers - all media types',), ('Transformers',)),
        (('destiny (video game)',), ('Destiny',)),
        (('fandom - fandom', 'meta - fandom'), ()),  # >_>
        (('house m.d.',), ('House, M.D.',)),
        (('the hobbit (jackson movies)',), ('The Hobbit',)),
        (('doctor strange (2016)',), ('Doctor Strange',)),
        (('arrow (tv 2012)',), ('Arrow',)),
        (('the flash (tv 2014)',), ('Flash',)),
        (('senki zesshou symphogear',), ('Symphogear',)),
        (('fullmetal alchemist: brotherhood & manga',
          'fullmetal alchemist - all media types',
          'fullmetal alchemist (anime 2003)'), ('Fullmetal Alchemist',)),
        (('star wars - all media types',
          'star wars episode vii: the force awakens (2015)',
          'star wars prequel trilogy'), ('Star Wars',)),
        (('guardians of the galaxy (2014)',
          'guardians of the galaxy - all media types',
          'guardians of the galaxy (movies)'),
         ('Guardians of the Galaxy',)),
        (('ant man (2015)', 'ant-man (movies)'), ('Ant Man',)),
        (('the defenders (marvel tv)',), ('The Defenders',)),
        (('elementary (tv)',), ('Elementary',)),
        (('good omens - neil gaiman & terry pratchett',), ('Good Omens',)),
        (('danny phantom',), ('Danny Phantom',)),
        (('katekyou hitman reborn!',), ('Katekyo Hitman Reborn!',)),
        (('welcome to night vale',), ('Welcome to Night Vale',)),
        (('ncis',), ('NCIS',)),
        (('torchwood',), ('Torchwood',)),
        (('magic: the gathering',), ('Magic: The Gathering',)),
        (('overwatch (video game)',), ('Overwatch',)),
        (('detroit: become human (video game)',),
         ('Detroit: Become Human',)),
        (('greek and roman mythology',), ()),
        (('life is strange (video game)',),
         ('life is strange (video game)',)),
        (('akatsuki no yona | yona of the dawn',), ('Yona of the Dawn',)),
        (('僕のヒーローアカデミア | boku no hero academia | my hero academia',),
         ('My Hero Academia',)),
        (('voltron: legendary defender',), ('Voltron',)),
        (('selfie (tv)',), ('Selfie',)),
        (('suits (tv)',), ('Suits',)),
        (('fruits basket',), ('Fruits Basket',)),
        (('hetalia: axis powers',), ('Hetalia: Axis Powers',)),
        (('carmilla (web series)',), ('Carmilla',)),
        (('the dresden files - jim butcher',), ('Dresden Files',)),
        (('girl genius',), ('Girl Genius',)),
        (('unspecified fandom',), ()),  # TODO?
        (('nightwing (comics)',), ('Nightwing',)),
        (('books of the raksura - martha wells',),
         ('Books of the Raksura',)),
        (('fall of ile-rien - martha wells',), ('Fall of Ile-Rien',)),
        (('vorkosigan saga - lois mcmaster bujold',), ('Vorkosigan Saga',)),
        (('highlander: the series', 'highlander - all media types'),
         ('Highlander',)),
        (('yoroiden samurai troopers | ronin warriors',),
         ('Ronin Warriors',)),
        (('hockey rpf',), ('Hockey RPF',)),
        (('pacific rim (2013)',), ('Pacific Rim',)),
        (('enchanted forest chronicles - patricia wrede',),
         ('Enchanted Forest Chronicles',)),
        (('tortall - tamora pierce',), ('Tortall',)),
        (('protector of the small - tamora pierce',),
         ('Protector of the Small',)),
        (('leverage',), ('Leverage',)),
        (('valdemar series - mercedes lackey',), ('Valdemar Series',)),
        (('b.p.r.d.', 'bureau for paranormal research and defense'),
         ('B.P.R.D.',)),
        (('hellboy (comic)',), ('Hellboy',)),
        (('sga/avatar',), ('Stargate Atlantis', 'Avatar')),
        (('annihilation (2018 garland)',), ('Annihilation',)),
        (('craft sequence - max gladstone',), ('Craft Sequence',)),
        (('the good place (tv)',), ('The Good Place',)),
        (('jessica jones (tv)',), ('Jessica Jones',)),
        (('mad max series (movies)',), ('Mad Max',)),
        (('american gods (tv)',), ('American Gods',)),
        (('terminator: the sarah connor chronicles',),
         ('Terminator: The Sarah Connor Chronicles', 'Terminator')),
        (('wolf 359 (radio)',), ('Wolf 359',)),
        (('shadowrun: dragonfall',), ('Shadowrun',)),
        (('ars paradoxica (podcast)',), ('Ars Paradoxica',)),
        (('love is strange - fandom',), ('Love is Strange',)),
        (('dune - all media types',), ('Dune',)),
        (('dragon age: origins',), ('Dragon Age: Origins',)),
        (('game of thrones (tv)',), ('Game of Thrones',)),
        (('chronicles of amber - roger zelazny',),
         ('Chronicles of Amber',)),
        (('the southern reach trilogy - jeff vandermeer',),
         ('The Southern Reach Trilogy',)),
        (('continuum (tv)',), ('Continuum',)),
        (('mage: the ascension',), ('Mage: The Ascension',)),
        (('the good wife (tv)', 'good wife (tv)'), ('The Good Wife',)),
        (('alliance-union - c. j. cherryh',), ('Alliance-Union',)),
        (('indexing - seanan mcguire',), ('Indexing',)),
        (('ultraviolet (tv)',), ('Ultraviolet',)),
        (('veronica mars (tv)',), ('Veronica Mars',)),
        (('secret circle (tv)',), ('Secret Circle',)),
        (('mahou shoujo madoka magika | puella magi madoka magica',),
         ('Madoka Magica',)),
        (('agent carter (tv)',), ('Agent Carter',)),
        (('dracula & related fandoms',), ('Dracula',)),
        (('dragon ball',), ('Dragon Ball',)),
        (('mass effect - all media types',), ('Mass Effect',)),
        (('firefly', 'serenity (2005)'), ('Firefly',)),
    ]
    knownFandoms = {
        alias: targets for aliases, targets in aliasGroups for alias in aliases
    }
    harryPotterTags = (
        'harry potter - fandom',
        'fantastic beasts and where to find them (movies)',
        'harry potter next generation - fandom',
    )

    fandomDd = soup.find('dd', {'class': 'fandom tags'})
    if fandomDd is not None:
        for ft in fandomDd.findAll('a', {'class': 'tag'}):
            originalF = ft.contents[0].strip()
            f = originalF.lower()

            # the one pattern-based rule is checked before the exact table
            if (
                f.startswith("harry potter ") and f.endswith("rowling")
            ) or f in harryPotterTags:
                fic.add(Fandom.define('Harry Potter'))
                continue

            targets = knownFandoms.get(f)
            if targets is not None:
                for target in targets:
                    fic.add(Fandom.define(target))
                continue

            # not hard coded above: consult the shared module-level map
            anyHere = False
            for fm in ao3FandomsMap:
                if not any(f == uf.lower().strip() for uf in fm[0]):
                    continue
                anyHere = True
                for mf in fm[1]:
                    fic.add(Fandom.define(mf))
            if not anyHere:
                util.logMessage(f'ao3|unknown fandom|{fic.url}|{originalF}')
                #raise Exception('unknown fandom: {} "{}"'.format(fic.url, originalF))

    ourDoms = fic.fandoms()
    # we have a canonical fandom, try to find our characters
    if len(ourDoms) == 1:
        relationshipDd = soup.find('dd', {'class': 'relationship tags'})
        if relationshipDd is not None:
            for rt in relationshipDd.findAll('a', {'class': 'tag'}):
                r = rt.contents[0]
                chars = r.split('/')
                if len(chars) > 8:  # TODO: sometimes more?
                    raise Exception('unable to parse relationship: {}'.format(r))
                for char in chars:
                    fic.add(Character.defineInFandom(ourDoms[0], char, self.ftype))

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate ``fic`` from a story page plus its author's profile page.

    The story page (``html``) supplies the author, title and chapter list.
    The author's profile, fetched through ``scrape.scrape``, supplies the
    description, review count and last-updated date.  Every chapter is
    cached and upserted, and the word count is summed from chapter html.

    Raises ``Exception`` when the info pane, author link, title, story table,
    review count or last-updated date cannot be located.
    """
    from bs4 import BeautifulSoup

    html = html.replace('\r\n', '\n')
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    infoPanes = soup.findAll('td', {'class': 'info2_pane'})
    if len(infoPanes) != 1:
        raise Exception('unable to find info2_pane: {}'.format(fic.url))
    infoPane = infoPanes[0]

    authorHrefPrefix = 'index.php?action=profile&id='
    authorUrl = None
    for authorLink in infoPane.findAll('a'):
        href = authorLink.get('href')
        # anchors without an href (e.g. named anchors) are not author links
        if href is None or not href.startswith(authorHrefPrefix):
            continue

        authorUrl = self.baseUrl + '/' + href
        author = authorLink.getText()
        authorLocalId = href[len(authorHrefPrefix):]
        self.setAuthor(fic, author, authorUrl, authorLocalId)
        break
    else:
        raise Exception('unable to find author: {}'.format(fic.url))

    # DOTALL lets '.' span newlines, so the title may wrap across lines
    titleMatch = re.search(
        r'<b>Story</b>:(.*)<b>Chapter</b>:', str(infoPane), re.DOTALL
    )
    if titleMatch is None:
        edumpContent(str(infoPane), 'sugarquill_title')
        raise Exception('could not locate title')
    # non-breaking spaces may appear as an entity or as a raw U+00A0
    fic.title = (
        titleMatch.group(1).replace('&nbsp;', ' ').replace('\xa0', ' ').strip()
    )

    chapterOptions = infoPane.findAll('option')
    chapterTitles = {}
    for chapterOption in chapterOptions:
        chapterTitles[int(chapterOption.get('value'))] = (
            chapterOption.getText().strip()
        )
    fic.chapterCount = len(chapterOptions)

    fic.ageRating = '<unkown>'  # TODO
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

    authorProfileHtml = scrape.scrape(authorUrl)['raw']
    authorProfileHtml = authorProfileHtml.replace('\r', '')
    authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

    # find the profile table whose read link points at this story
    storyHrefPrefix = 'read.php?storyid='
    ourStoryTable = None
    for storyTable in authorSoup.findAll('table', {'width': '90%'}):
        storyId = None
        for a in storyTable.findAll('a'):
            href = a.get('href')
            if href is None or not href.startswith(storyHrefPrefix):
                continue
            # keep the id up to the first '&'; partition also copes with
            # links that have no further query parameters
            rawId = href[len(storyHrefPrefix):].partition('&')[0]
            storyId = str(int(rawId))
        if storyId is None or storyId != str(fic.localId):
            continue
        ourStoryTable = storyTable
    if ourStoryTable is None:
        raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

    trs = ourStoryTable.findAll('tr')
    if len(trs) != 3:
        raise Exception(
            f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
        )

    fic.description = trs[1].find('td').getText().strip()

    reviewsMatch = re.search(
        r'\( Reviews: <a[^>]*>(\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
    )
    if reviewsMatch is None:
        edumpContent(str(trs[0]), 'sugarquill_reviews')
        raise Exception('could not locate reviews')
    fic.reviewCount = int(reviewsMatch.group(1).strip())

    updatedMatch = re.search(r'Last updated (\d+/\d+/\d+)', str(trs[2]))
    if updatedMatch is None:
        edumpContent(str(trs[2]), 'sugarquill_updated')
        raise Exception('could not locate last updated')
    fic.updated = OilTimestamp(
        util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
    )
    # only an updated date is parsed here, so seed published from it
    if fic.published is None:
        fic.published = fic.updated

    fic.wordCount = 0
    fic.upsert()

    for cid in range(1, fic.chapterCount + 1):
        ch = fic.chapter(cid)
        ch.localChapterId = str(cid)
        ch.title = chapterTitles[cid]
        ch.cache()
        ch.upsert()

        # word count is summed from whatever chapter html is available
        chtml = ch.html()
        if chtml is not None:
            fic.wordCount += len(chtml.split())

    fic.add(Fandom.define('Harry Potter'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from a story index page and return it.

    ``fic.localId`` is expected to look like ``<authorLid>/<storyLid>``.
    Title, author and summary come from the page header and the ``well``
    div; rating, chapter count and word count are matched out of the well's
    text.  Per-chapter paragraphs supply review counts and upload dates,
    from which the review total and published/updated dates are derived.

    Raises ``Exception`` when the status is not 'In progress', when a chapter
    link is missing or has an unexpected url, or when no chapter paragraph
    is found.
    """
    from bs4 import BeautifulSoup

    authorLid = fic.localId.split('/')[0]
    storyLid = fic.localId.split('/')[1]

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?
    fic.url = self.constructUrl(fic.localId)

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0
    fic.ageRating = 'M'

    soup = BeautifulSoup(wwwHtml, 'html5lib')

    pageHeader = soup.find('div', {'class': 'page-header'})
    fic.title = pageHeader.find('h2').getText().strip()

    authorLink = pageHeader.find('a')
    author = authorLink.getText().strip()
    authorId = authorLid
    authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
    self.setAuthor(fic, author, authorUrl, authorId)

    divWell = soup.find('div', {'class': 'well'})

    summaryQuote = divWell.find('blockquote')
    # collapse every run of spaces, tabs and line breaks to a single space
    fic.description = re.sub(
        r'[ \t\r\n]+', ' ', str(summaryQuote.getText())
    ).strip()

    divWellText = divWell.getText().strip()

    match = re.search(r'Status:\s*([^-]*) -', divWellText)
    if match is not None and match.group(1) == 'In progress':
        fic.ficStatus = FicStatus.ongoing
    else:
        raise Exception('unable to find fic status')

    RegexMatcher(
        divWellText, {
            'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
            'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
            'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
        }
    ).matchAll(fic)
    assert (fic.chapterCount is not None)

    # the count is matched as text and may use thousands separators;
    # always normalise it to an int, not only when a comma is present
    if fic.wordCount is not None:
        fic.wordCount = int(str(fic.wordCount).replace(',', ''))

    wellParent = divWell.parent
    cid = 0
    reviewCount = 0
    chapterDates: List[int] = []

    for child in wellParent.children:
        if child.name != 'p':
            continue
        # NOTE(review): cid advances for every <p>, including ones that
        # are skipped just below -- confirm that is the intent
        cid += 1
        if str(child).find('Chapter {}'.format(cid)) == -1:
            continue

        chapterLink = child.find('a')
        chapterHref = None if chapterLink is None else chapterLink.get('href')
        if chapterHref is None:
            raise Exception('unexpected chapter url: {}'.format(chapterHref))
        expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
        if chapterHref.lower() != expectedUrl:
            raise Exception('unexpected chapter url: ' + chapterHref)

        chInfo = ChapterInfo()
        RegexMatcher(
            child.getText(), {
                'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
                'reviewCount': (r'Reviews\s*:\s+([^-]+) -', int),
                'updated': (r'Uploaded on\s*:\s+(.+)', str),
            }
        ).matchAll(chInfo)
        assert (chInfo.updated is not None)

        # per-chapter word counts are not summed: the overall word count
        # was already taken from the story metadata
        reviewCount += chInfo.reviewCount

        chapterDates.append(
            util.parseDateAsUnix(chInfo.updated, int(time.time()))
        )

    if not chapterDates:
        # min()/max() would fail with an unhelpful ValueError otherwise
        raise Exception('unable to find any chapters: {}'.format(fic.url))

    fic.reviewCount = reviewCount
    fic.published = OilTimestamp(min(chapterDates))
    fic.updated = OilTimestamp(max(chapterDates))

    fic.upsert()
    for cid in range(1, fic.chapterCount + 1):
        ch = fic.chapter(cid)
        ch.localChapterId = 'Chapter_{}'.format(cid)
        ch.url = self.constructUrl(fic.localId, cid)
        ch.upsert()

    return fic
def parseRussianDate(self, datestr: str) -> OilTimestamp:
    """Convert a dot-separated date to an ``OilTimestamp``.

    The first two dot-separated fields of ``datestr`` are swapped before
    the string is handed to ``util.parseDateAsUnix``; the third field stays
    last.  Raises ``IndexError`` if there are fewer than three fields.
    """
    fields = datestr.split('.')
    swapped = f'{fields[1]}.{fields[0]}.{fields[2]}'
    return OilTimestamp(util.parseDateAsUnix(swapped, int(time.time())))
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from a story page's ``profile_top`` and return it.

    If ``profile_top`` is missing and the page shows a known "story not
    found" warning, the fic is marked abandoned (unless already complete),
    upserted and returned early.  Otherwise title, description, counts,
    dates, status, author, fandoms and chapter titles are parsed, and the
    fic and its chapters are upserted.  Finally the grey meta span is parsed
    on a best-effort basis to fill ``extraMeta`` and refine the dates;
    failures there are logged, not raised.

    Raises ``Exception`` when ``profile_top``, the title, description or
    author cannot be found, on an unknown status or category, or when the
    fic cannot be re-selected after the upsert.
    """
    from bs4 import BeautifulSoup  # type: ignore
    deletedFicTexts = [
        # probably deleted by user
        'Story Not FoundUnable to locate story. Code 1.',
        # probably deleted by admin
        'Story Not FoundUnable to locate story. Code 2.',
        # unknown
        'Story Not FoundStory is unavailable for reading. (A)',
    ]
    soup = BeautifulSoup(wwwHtml, 'html5lib')
    profile_top = soup.find(id='profile_top')
    # story might've been deleted
    if profile_top is None:
        for gui_warning in soup.find_all('span', {'class': 'gui_warning'}):
            if gui_warning.get_text() in deletedFicTexts:
                if fic.ficStatus != FicStatus.complete:
                    fic.ficStatus = FicStatus.abandoned
                fic.upsert()
                return fic
        # nothing below can work without profile_top; fail with a clear
        # message instead of an AttributeError on None
        raise Exception('unable to find profile_top: {}'.format(fic.localId))

    text = profile_top.get_text()
    pt_str = str(profile_top)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    for b in profile_top.find_all('b'):
        b_class = b.get('class') or []  # <b> without a class yields None
        if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
            fic.title = b.get_text()
            break
    else:
        raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

    fic.url = self.constructUrl(fic.localId, 1, fic.title)

    for div in profile_top.find_all('div'):
        div_class = div.get('class') or []
        if (
            div.get('style') == 'margin-top:2px' and len(div_class) == 1
            and div_class[0] == 'xcontrast_txt'
        ):
            fic.description = div.get_text()
            break
    else:
        raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    # TODO we should match this only on the section following the description
    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
            'chapterCount?': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\S+)', int),
            'reviewCount?': (r'Reviews:\s+(\S+)', int),
            'favoriteCount?': (r'Favs:\s+(\S+)', int),
            'followCount?': (r'Follows:\s+(\S+)', int),
            'updated?': (r'Rated:.*Updated:\s+(\S+)', str),
            'published': (r'Published:\s+([^-]+)', str),
        }
    )
    matcher.matchAll(fic)

    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    # a missing "Status:" field is treated as still ongoing
    match = re.search(
        r'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
    )
    if match is None:
        fic.ficStatus = FicStatus.ongoing
    else:
        status = match.group(2)
        if status == 'Complete':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception('unknown status: {}: {}'.format(fic.url, status))

    for a in profile_top.find_all('a'):
        a_href = a.get('href')
        if a_href is not None and a_href.startswith('/u/'):
            author = a.get_text()
            authorUrl = self.baseUrl + a_href
            authorId = a_href.split('/')[2]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    preStoryLinks = soup.find(id='pre_story_links')
    preStoryLinksLinks = []
    if preStoryLinks is not None:
        preStoryLinksLinks = preStoryLinks.find_all('a')
    pendingFandoms: List[Fandom] = []
    for a in preStoryLinksLinks:
        href = a.get('href')
        if href is None:
            continue  # anchor without a target carries no fandom info
        hrefParts = href.split('/')

        # if it's a top level category
        if (
            len(hrefParts) == 3 and len(hrefParts[0]) == 0
            and len(hrefParts[2]) == 0
        ):
            cat = hrefParts[1]
            if cat in ffNetFandomCategories:
                continue  # skip categories
            raise Exception('unknown category: {}'.format(cat))

        # if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
        if (
            len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
            and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
        ):
            fIds = [int(hrefParts[2]), int(hrefParts[3])]
            pendingFandoms += self.handleCrossoverFandom(
                fic, hrefParts[1], fIds, href
            )
            continue

        # if it's a regular fandom in some category
        if (
            len(hrefParts) == 4 and len(hrefParts[0]) == 0
            and len(hrefParts[3]) == 0
        ):
            # ensure category is in our map
            if hrefParts[1] not in ffNetFandomCategories:
                raise Exception('unknown category: {}'.format(hrefParts[1]))

            pendingFandoms += self.handleFandom(fic, hrefParts[2])
            continue

        util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

    # re-select the stored row after the upsert before attaching fandoms
    fic.upsert()
    poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
    if len(poss) != 1:
        raise Exception('unable to upsert fic?')
    fic = poss[0]
    for pfandom in pendingFandoms:
        fic.add(pfandom)

    if fic.chapterCount is None:
        return fic

    chapterTitles = []
    if fic.chapterCount > 1:
        chapterSelect = soup.find(id='chap_select')
        chapterOptions = []
        if chapterSelect is not None:
            chapterOptions = chapterSelect.findAll('option')
        chapterTitles = [co.getText().strip() for co in chapterOptions]

    for cid in range(1, fic.chapterCount + 1):
        ch = fic.chapter(cid)
        ch.localChapterId = str(cid)
        ch.url = self.constructUrl(fic.localId, cid)
        # titles are 0-indexed while cid is 1-indexed, so cid is valid up
        # to and including len(chapterTitles)
        if cid <= len(chapterTitles):
            ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
        elif fic.chapterCount == 1 and cid == 1:
            ch.title = fic.title
        ch.upsert()

    metaSpan = profile_top.find('span', {'class': 'xgray'})
    if metaSpan is not None:
        # best effort: a failure here must not discard what was parsed above
        try:
            res = self.parseFicMetaSpan(metaSpan.decode_contents())
            #fic.language = res["language"]

            # reconstruct
            fields = [
                ('rated', 'Rated: Fiction ZZZ'),
                ('language', 'Language: ZZZ'),
                ('genres', 'Genre: ZZZ'),
                ('characters', 'Characters: ZZZ'),
                ('reviews', 'Reviews: ZZZ'),
                ('favorites', 'Favs: ZZZ'),
                ('follows', 'Follows: ZZZ'),
            ]
            rmeta = ' - '.join(
                [f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]
            )

            fic.extraMeta = rmeta
            publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
            fic.published = OilTimestamp(publishedUts)
            fic.updated = fic.published
            if 'updated' in res:
                updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
                fic.updated = OilTimestamp(updatedUts)
            fic.upsert()

        except Exception as e:
            util.logMessage(
                f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
            )
            util.logMessage(
                f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
            )

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate ``fic`` with metadata scraped from a SIYE story info page.

    Sets title, author, description, age rating, chapter count, status,
    published/updated timestamps and word count, then attaches the
    'Harry Potter' fandom.

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        html: raw html of the story page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if the page does not contain exactly three 95%-width
            tables, or the author link or summary cannot be located.
    """
    from bs4 import BeautifulSoup
    html = html.replace('\r\n', '\n')
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    w95tables = soup.findAll('table', {'width': '95%'})
    if len(w95tables) != 3:
        raise Exception('wrong number of w95 tables: {}'.format(
            len(w95tables)))

    ficInfoTable = w95tables[0]
    ficTitleH3 = ficInfoTable.find('h3')
    fic.title = ficTitleH3.get_text().strip()

    authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
    if authorUrlMatch is None:
        raise Exception('could not locate author url')

    author = authorUrlMatch.group(2)
    authorId = authorUrlMatch.group(1)
    authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId
    self.setAuthor(fic, author, authorUrl, authorId)

    # TODO: this may miss multiline summaries :(
    summaryMatch = re.search(
        '<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE)
    if summaryMatch is None:
        edumpContent(html, 'siye_summary')
        raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

    fic.description = summaryMatch.group(1).strip()

    fic.ageRating = '<unkown>'
    ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
    if ageRatingMatch is not None:
        fic.ageRating = ageRatingMatch.group(1).strip()

    # the chapter count is the largest chapter number linked from the page;
    # single chapter fics link to "chapter=Array" instead of a number
    maxChapter = 0
    baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
    singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
        fic.localId)
    isSingleChapterFic = False
    for a in soup.find_all('a'):
        href = a.get('href')
        if href is None:
            continue
        if not href.startswith(baseChapterHref):
            continue
        if href.startswith(singleChapterHref):
            isSingleChapterFic = True
            maxChapter = max(1, maxChapter)
            continue
        cid = int(href[len(baseChapterHref):])
        maxChapter = max(cid, maxChapter)

    fic.chapterCount = maxChapter

    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ficStatus = FicStatus.ongoing
    # str.find returns -1 (which is truthy) when the text is absent, so it
    # must not be used as a boolean; test membership instead
    if 'Story is Complete' in html:
        fic.ficStatus = FicStatus.complete

    # published/updated are the earliest/latest "updated on" dates listed
    updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
    minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
    maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
    for (year, month, day) in re.findall(updatedOnPattern, html):
        date = '{}/{}/{}'.format(year, month, day)
        dt = util.parseDateAsUnix(date, fic.fetched)
        minUpdate = min(minUpdate, dt)
        maxUpdate = max(maxUpdate, dt)

    if fic.published is None or fic.published.toUTS() > minUpdate:
        fic.published = OilTimestamp(minUpdate)
    if fic.updated is None or fic.updated.toUTS() < maxUpdate:
        fic.updated = OilTimestamp(maxUpdate)
    if fic.updated < fic.published:
        fic.updated = fic.published

    fic.wordCount = 0
    wordsPattern = re.compile(r'(\d+) words')
    for words in re.findall(wordsPattern, html):
        fic.wordCount += int(words)

    if fic.wordCount == 0 and isSingleChapterFic:
        # best effort: estimate the word count from the first chapter's
        # content; a failure here must not abort the metadata parse
        try:
            fic.upsert()
            ch1 = fic.chapter(1)
            ch1.cache()
            chtml = ch1.html()
            if chtml is not None:
                fic.wordCount = len(chtml.split())
        except Exception as e:
            util.logMessage(
                'parseInfoInto: unable to estimate word count: {}'.format(e))

    fic.add(Fandom.define('Harry Potter'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from a story page with a single 'details' div.

    Reads the title, the counters and dates in the details text, the
    completion status and the author link.

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        wwwHtml: raw html of the story page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if the details div, the title div or the author link
            cannot be found.
    """
    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(wwwHtml, 'html.parser')
    divDetails = soup.find_all('div', {'class': 'details'})
    if len(divDetails) != 1:
        raise Exception('error: unable to find details\n')
    divDetails = divDetails[0]

    text = divDetails.get_text()
    pt_str = str(divDetails)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    divTitle = soup.find_all('div', {'class': 'title'})
    if len(divTitle) != 1:
        raise Exception(
            'error: unable to find title:\n{}\n'.format(pt_str))
    fic.title = divTitle[0].get_text().strip()

    fic.url = self.constructUrl(fic.localId, 1)

    # TODO: this may not exist on fictionhunt?
    fic.description = 'archive of {} from fictionhunt TODO'.format(
        fic.title)

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rated:\s+(\S+)', str),
            'chapterCount?': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\S+)', int),
            'reviewCount?': (r'Reviews:\s+(\S+)', int),
            'favoriteCount?': (r'Favs:\s+(\S+)', int),
            'followCount?': (r'Follows:\s+(\S+)', int),
            'updated?': (r'Updated:\s+(\S+)', str),
            'published': (r'Published:\s+(\S+)', str),
        })
    matcher.matchAll(fic)

    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    if re.search('- Complete -', text) is None:
        fic.ficStatus = FicStatus.ongoing
    else:
        fic.ficStatus = FicStatus.complete

    for a in divDetails.find_all('a'):
        a_href = a.get('href')
        # anchors without an href attribute yield None; skip them instead
        # of crashing on the attribute access
        if a_href is None:
            continue
        if a_href.find('fanfiction.net/u/') != -1:
            author = a.get_text()
            authorUrl = a_href
            authorId = a_href.split('/')[-1]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    # TODO: hardcode Harry Potter fanfic?

    return fic
def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
    """Populate ``fic`` from the html of a single 'z-list' listing entry.

    Does nothing if ``fic`` was already fetched more recently than ``ts``.
    Otherwise fills in title, description, counters, dates, status, author
    and (when the entry carries a ``data-category``) fandoms, then upserts
    the fic.

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        ts: unix timestamp at which ``html`` was retrieved; stored as the
            fic's fetched time.
        html: html fragment of the listing entry.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if the title, description or author cannot be found, or
            the status is not recognized.
    """
    # existing data is newer, do nothing
    if fic.fetched is not None and fic.fetched.toUTS() > ts:
        return fic

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html5lib')

    text = soup.get_text()
    pt_str = str(html)

    fic.fetched = OilTimestamp(ts)
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    # NOTE(review): this runs before the title is parsed below, so it uses
    # whatever title the fic already had -- confirm that is intended
    fic.url = self.constructUrl(fic.localId, 1, fic.title)

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    for a in soup.find_all('a', {'class': 'stitle'}):
        fic.title = a.getText()
        break
    else:
        raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

    for div in soup.find_all('div', {'class': 'z-padtop'}):
        fic.description = div.contents[0]
        break
    else:
        raise Exception(
            'error: unable to find description:\n{}\n'.format(pt_str))

    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rated:\s+(?:Fiction)?\s*(\S+)', str),
            'chapterCount?': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\S+)', int),
            'reviewCount?': (r'Reviews:\s+(\S+)', int),
            'favoriteCount?': (r'Favs:\s+(\S+)', int),
            'followCount?': (r'Follows:\s+(\S+)', int),
            'updated?': (r'Updated:\s+(\S+)', str),
            'published': (r'Published:\s+([^-]+)', str),
        }
    )
    matcher.matchAll(fic)

    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    match = re.search(
        r'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
    )
    if match is None:
        fic.ficStatus = FicStatus.ongoing
    else:
        status = match.group(2)
        if status == 'Complete':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception(
                'unknown status: {}: {}'.format(fic.url, status))

    for a in soup.find_all('a'):
        a_href = a.get('href')
        # anchors without an href attribute yield None; skip them instead
        # of crashing on the attribute access
        if a_href is None:
            continue
        if a_href.startswith('/u/'):
            author = a.get_text()
            authorUrl = self.baseUrl + a_href
            authorId = a_href.split('/')[2]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    zl = soup.find('div', {'class': 'z-list'})
    fan = None if zl is None else zl.get('data-category')
    pendingFandoms: List[Fandom] = []
    if fan is not None:
        pendingFandoms += self.handleFandom(fic, fan)
        # TODO: crossovers?

    # fandoms are attached only after the upsert
    fic.upsert()
    for pfandom in pendingFandoms:
        fic.add(pfandom)

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` and its chapters from a fiction overview page.

    Reads title, author, description, status, follower/favorite counts and
    the chapter table; upserts the fic and every chapter; then computes the
    word count from the chapters' html (calling ``chapter.cache()`` for any
    chapter whose html is not yet available).

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        wwwHtml: raw html of the overview page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if none of the known status labels is present.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    fic.url = self.constructUrl(fic.localId)

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ageRating = 'M'  # TODO?

    ficTitleDiv = soup.find('div', {'class': 'fic-title'})
    fic.title = ficTitleDiv.find('h1').getText().strip()

    authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
    author = authorLink.getText().strip()
    authorUrl = self.baseUrl + authorLink.get('href')
    authorId = authorUrl.split('/')[-1]
    self.setAuthor(fic, author, authorUrl, authorId)

    divDescription = soup.find('div', {'class': 'description'})
    try:
        descView = HtmlView(str(divDescription), markdown=False)
        desc = ''.join(['<p>{}</p>'.format(l) for l in descView.text])
        fic.description = desc
    except Exception:
        # fall back to the plain text if the description can't be rendered
        fic.description = divDescription.getText().strip()

    fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
    if fictionInfo.find('>ONGOING<') != -1:
        fic.ficStatus = FicStatus.ongoing
    elif fictionInfo.find('>COMPLETED<') != -1:
        fic.ficStatus = FicStatus.complete
    elif fictionInfo.find('>HIATUS<') != -1:
        fic.ficStatus = FicStatus.ongoing  # TODO?
    elif fictionInfo.find('>STUB<') != -1:
        fic.ficStatus = FicStatus.ongoing  # TODO?
    elif fictionInfo.find('>DROPPED<') != -1:
        fic.ficStatus = FicStatus.abandoned
    else:
        raise Exception('unable to find fic status')

    divStatsContent = soup.find('div', {'class': 'stats-content'})
    followers = divStatsContent.find(text='Followers :')
    ul = followers.parent.parent

    RegexMatcher(
        ul.getText(), {
            'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
            'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
        }
    ).matchAll(fic)

    # the counts are matched as strings that may contain thousands
    # separators; always normalize them to ints (str.find returns -1, which
    # is truthy, when there is no comma, so it can't be used as a test)
    fic.followCount = int(str(fic.followCount).replace(',', ''))
    fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

    tableChapters = soup.find('table', {'id': 'chapters'})
    chapterLinks = tableChapters.findAll('a')

    # links without a <time> child carry the chapter url and title
    chapterUrls: List[str] = []
    chapterTitles: List[str] = []
    for chapterLink in chapterLinks:
        # TODO FIXME is this inverted?
        if chapterLink.find('time') is not None:
            continue
        chapterUrls += [chapterLink.get('href')]
        chapterTitles += [chapterLink.getText().strip()]

    # links with a <time> child carry the chapter date
    chapterDates: List[int] = []
    for chapterLink in chapterLinks:
        timeElement = chapterLink.find('time')
        if timeElement is None:
            continue
        if timeElement.get('unixtime'):
            chapterDates += [int(timeElement.get('unixtime'))]
        else:
            chapterDates += [
                util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
            ]

    fic.published = OilTimestamp(min(chapterDates))
    fic.updated = OilTimestamp(max(chapterDates))
    fic.chapterCount = len(chapterUrls)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    for cid in range(1, fic.chapterCount + 1):
        chapterUrl = chapterUrls[cid - 1]
        chapter = fic.chapter(cid)
        chapter.url = self.baseUrl + chapterUrl
        if chapterUrl.startswith('/fiction/chapter/'):
            # alternate chapter syntax if the chapter itself has no slug
            # /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
            chapter.localChapterId = chapterUrl.split('/')[3].split('?')[0]
        else:
            # standard chapter syntax
            # /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
            chapter.localChapterId = chapterUrl.split('/')[5]
        chapter.title = chapterTitles[cid - 1]
        if chapter.title is not None and len(chapter.title) > 0:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)
        chapter.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapter = fic.chapter(cid)
        if chapter.html() is None:
            chapter.cache()

        chapter.upsert()
        chtml = chapter.html()
        if chtml is not None:
            wordCount += len(chtml.split())

    fic.wordCount = wordCount

    return fic
def extractSearchMetadata(
    self,
    html: str,
    metas: Dict[str, AdultFanfictionMeta] = None,  # type: ignore
) -> Dict[str, AdultFanfictionMeta]:
    """Extract story metadata from a search result page.

    Every 90%-width table in ``html`` is parsed into an
    ``AdultFanfictionMeta``. A parsed entry is stored in ``metas`` under its
    url unless ``metas`` already holds an entry for that url that the new
    one is not newer than.

    Args:
        html: raw html of the search result page.
        metas: optional mapping of url to metadata to merge results into.
            A fresh dict is created when omitted, so results no longer
            leak between calls through a shared default.

    Returns:
        ``metas`` (or the freshly created dict) with the parsed entries
        merged in.
    """
    from bs4 import BeautifulSoup
    if metas is None:
        metas = {}

    archiveFandomMap = {
        'naruto': 'Naruto',
        'hp': 'Harry Potter',
        'xmen': 'X-Men',
    }
    locatedFandomMap = [
        ('Mass Effect', 'Mass Effect'),
        ('Metroid', 'Metroid'),
        ('Pokemon', 'Pokemon'),
        ('Sonic', 'Sonic'),
        ('Witcher 3: Wild Hunt', 'Witcher'),
    ]
    chars = [
        'Harry', 'Hermione', 'Snape', 'Draco', 'Sirius', 'Remus', 'Lucius',
        'Ron', 'Voldemort', 'Ginny', 'Charlie', 'Lily', 'Scorpius', 'James',
        'George', 'Fred', 'Narcissa', 'Blaise', 'Bill', 'Luna', 'Albus',
        'Severus', 'Fenrir', 'Tonks', 'Rose', 'Neville', 'Cho', 'Cedric',
        'Tom', 'Seamus', 'Pansy', 'Bellatrix', 'Viktor', 'Percy', 'Dudley',
        'McGonagall', 'Lavendar', 'Dumbledore', 'Naruto', 'Sasuke',
        'Kakashi', 'Iruka', 'Sakura', 'Itachi', 'Gaara', 'Shikamaru', 'Neji',
        'Rock Lee', 'Hinata', 'Ino', 'Shino', 'Danzo'
    ]

    spaceSqeeezeRe = re.compile(r'\s+')

    searchSoup = BeautifulSoup(html, 'html5lib')
    resultTables = searchSoup.findAll('table', {'width': '90%'})
    for resultTable in resultTables:
        meta = AdultFanfictionMeta()

        # the first link is the story, the second is its author
        links = resultTable.findAll('a')
        titleLink = links[0]
        meta.title = titleLink.getText()
        meta.url = titleLink.get('href')

        authorLink = links[1]
        meta.author = authorLink.getText().strip()
        meta.authorUrl = authorLink.get('href').strip()
        assert (meta.authorUrl is not None)
        meta.authorId = meta.authorUrl.split('=')[-1]

        # rows: 0 = published, 1 = extended metadata, 2 = description,
        # 3 = tags
        trs = resultTable.findAll('tr')

        publishedText = trs[0].getText()
        RegexMatcher(publishedText, {
            'published': (r'Published\s+:\s+(.+)', str),
        }).matchAll(meta)
        assert (meta.published is not None)
        meta.published = util.parseDateAsUnix(
            meta.published, int(time.time()))

        extendedMetadata = trs[1].getText()
        util.logMessage(extendedMetadata, 'tmp_e_meta_aff.log')

        # TODO: dragon prints are actually views, not followCount/favoriteCount
        RegexMatcher(
            extendedMetadata, {
                'chapterCount': (r'Chapters\s*:\s*(\d+)', int),
                'updated': (r'Updated\s+:\s+(.+?)-:-', str),
                'reviewCount?': (r'Reviews\s+:\s+(\d+)', int),
                'views?': (r'Dragon prints\s+:\s+(\d+)', int),
                'located?': (r'Located\s*:\s*(.*)', str)
            }).matchAll(meta)

        assert (meta.updated is not None)
        meta.updated = util.parseDateAsUnix(meta.updated, int(time.time()))

        meta.description = str(trs[2])
        meta.description = util.filterUnicode(meta.description)
        meta.description = spaceSqeeezeRe.sub(' ', meta.description)

        meta.setTags(str(trs[3]))
        if 'COMPLETE' in meta.tags or 'Complete.' in meta.tags:
            meta.ficStatus = FicStatus.complete

        assert (meta.url is not None)
        ficId = FicId.tryParseUrl(meta.url)
        assert (ficId is not None)
        meta.localId = ficId.localId
        # the local id is split on '/' into archive and story number
        meta.archive = meta.localId.split('/')[0]
        meta.storyNo = meta.localId.split('/')[1]

        if meta.archive.lower() in archiveFandomMap:
            meta.fandoms += [archiveFandomMap[meta.archive.lower()]]

        meta.located = meta.located or ''
        loclow = meta.located.lower()
        for locFan in locatedFandomMap:
            if loclow.endswith(locFan[0].lower()):
                meta.fandoms += [locFan[1]]

        # a location ending in "<char>/<char>" names a character pairing
        for c1 in chars:
            for c2 in chars:
                if loclow.endswith('{}/{}'.format(c1, c2).lower()):
                    meta.chars += [c1, c2]

        # TODO: try parse category, get chars

        if meta.url not in metas or meta.isNewerThan(metas[meta.url]):
            metas[meta.url] = meta

    return metas
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate ``fic`` from a story page with a 'Story Information' block.

    Reads title and author from the ``pagetitle`` element, the description
    from between the SUMMARY START/END html comments, counters and dates
    from the 'Story Information' block, and the review count from the
    ``sort`` element. Non-crossover fics get the 'Harry Potter' fandom.

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        html: raw html of the story page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if the title, author or info block cannot be found, or
            the 'Completed' value is neither 'Yes' nor 'No'.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    pagetitle = soup.find(id='pagetitle')
    author = None
    for a in pagetitle.findAll('a'):
        href = a.get('href')
        # anchors without an href attribute yield None; skip them instead
        # of crashing on the attribute access
        if href is None:
            continue
        if href.startswith('viewstory'):
            fic.title = a.contents[0].strip()
        elif href.startswith('viewuser.php?uid='):
            author = a.contents[0]
            authorUrl = self.baseUrl + href
            authorId = str(int(href[len('viewuser.php?uid='):]))
            self.setAuthor(fic, author, authorUrl, authorId)

    if fic.title is None:
        raise Exception('unable to find title')
    if author is None:
        raise Exception('unable to find author')

    # split so that every tag starts its own line, then collect the lines
    # from the SUMMARY START marker (inclusive) to SUMMARY END (exclusive)
    lines = html.replace('\r', '\n').replace('<', '\n<').split('\n')
    inDescription = False
    descriptionLines = []
    for line in lines:
        cur = line.strip()
        if cur.find('!-- SUMMARY START --') != -1:
            inDescription = True
        elif cur.find('!-- SUMMARY END --') != -1:
            inDescription = False

        if inDescription:
            descriptionLines.append(cur + '\n')

    fic.description = ''.join(descriptionLines)

    fic.ageRating = '<unkown>'

    infoBlock = None
    infoText = None
    for block in soup.findAll('div', {'class': 'block'}):
        title = block.find('div', {'class': 'title'})
        if title is None:
            continue
        if title.contents[0] != 'Story Information':
            continue
        infoBlock = block
        infoText = block.get_text()
        break
    else:
        raise Exception('unable to find info text')

    matcher = RegexMatcher(
        infoText, {
            'chapterCount': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Word count:\s+(\S+)', int),
        })
    matcher.matchAll(fic)

    sortDiv = soup.find(id='sort')
    match = re.search(r'Reviews\s*-\s*([^\]]+)', sortDiv.get_text())
    if match is not None:
        fic.reviewCount = int(match.group(1).replace(',', ''))
    else:
        fic.reviewCount = 0

    fic.favoriteCount = 0
    fic.followCount = 0

    infoBlockHtml = str(infoBlock)
    match = re.search(
        '<!-- PUBLISHED START -->([^<]*)<!-- PUBLISHED END -->',
        infoBlockHtml)
    if match is not None:
        publishedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    match = re.search('<!-- UPDATED START -->([^<]*)<!-- UPDATED END -->',
                      infoBlockHtml)
    if match is not None:
        updatedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.updated is None:
        fic.updated = fic.published

    match = re.search(r'Completed:\s+(\S+)', infoText)
    if match is not None:
        complete = match.group(1)
        if complete == 'No':
            fic.ficStatus = FicStatus.ongoing
        elif complete == 'Yes':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception(
                'unknown complete value: {}'.format(complete))

    # crossovers are left without a fandom for now; anything else is
    # assumed to be plain harry potter
    if re.search('Crossovers', infoText) is None:
        fic.add(Fandom.define('Harry Potter'))

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from a story page with a 'storymaininfo' table.

    Reads the title from the story link, the description from the
    'storysummary' table, the counters, dates and status from the main
    info table, and the author from the first user link. The
    'Harry Potter' fandom is always attached.

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        wwwHtml: raw html of the story page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if the main info table, title link, summary table,
            status or author cannot be found.
    """
    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(wwwHtml, 'html.parser')
    storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
    if len(storyMainInfo) != 1:
        raise Exception('unable to find main story info')
    storyMainInfo = storyMainInfo[0]

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    # the title link either goes straight to the story or through an
    # age-confirmation javascript prompt
    disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
    plainHref = '?psid={}'.format(fic.localId)
    for a in soup.findAll('a'):
        href = a.get('href')
        # anchors without an href attribute yield None; skip them instead
        # of crashing on the attribute access
        if href is None:
            continue
        if not href.startswith(disclaimerJs) and href != plainHref:
            continue
        fic.title = a.getText()
        break
    else:
        raise Exception('error: unable to find title')

    fic.url = self.constructUrl(fic.localId)

    storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
    if len(storySummaryTable) != 1:
        raise Exception('cannot find story summary table')
    storySummaryTable = storySummaryTable[0]
    fic.description = storySummaryTable.getText().strip()

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    text = storyMainInfo.getText().replace('\xa0', ' ')
    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rating:\s+(Mature|15\+|12\+)', str),
            'chapterCount': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\d+)', int),
            'reviewCount': (r'Story Reviews:\s*(\d+)', int),
            'favoriteCount': (r'Favorite Story Of:\s+(\d+) users', int),
            'updated': (r'Last Updated:\s+(\S+)', str),
            'published': (r'First Published:\s+(\S+)', str),
        })
    matcher.matchAll(fic)

    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    match = re.search(
        r'Status:\s+(Completed|Work In Progress|Abandoned)', text)
    if match is None:
        raise Exception('cannot find write status')

    status = match.group(1)
    if status == 'Completed':
        fic.ficStatus = FicStatus.complete
    elif status == 'Work In Progress':
        fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
    elif status == 'Abandoned':
        fic.ficStatus = FicStatus.abandoned
    else:
        raise Exception('unknown status: {}'.format(status))

    for a in soup.findAll('a'):
        a_href = a.get('href')
        if a_href is None:
            continue
        if a_href.startswith('viewuser.php?showuid='):
            author = a.get_text()
            authorUrl = self.baseUrl + '/' + a_href
            authorId = a_href[len('viewuser.php?showuid='):]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    # TODO: chars/pairings?
    fic.add(Fandom.define('Harry Potter'))
    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` and its chapters from a FictionPress story page.

    If the page has no ``profile_top`` element but carries the "story not
    found" warning, the fic is marked abandoned, upserted and returned.
    Otherwise title, description, counters, dates, status, author and
    genres are read, the fic is upserted, and a chapter record is upserted
    for every chapter.

    Args:
        fic: the fic record to fill in; it is mutated and returned.
        wwwHtml: raw html of the story page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if the story info, title, description or author cannot
            be found, or an unknown status or category is encountered.
    """
    from bs4 import BeautifulSoup  # type: ignore
    deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
    soup = BeautifulSoup(wwwHtml, 'html5lib')
    profile_top = soup.find(id='profile_top')
    # story might've been deleted
    if profile_top is None:
        gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
        for gui_warning in gui_warnings:
            if gui_warning.get_text() == deletedFicText:
                fic.ficStatus = FicStatus.abandoned
                fic.upsert()
                return fic
        # neither story info nor a deletion notice: fail with a clear
        # message rather than an AttributeError on None below
        raise Exception('error: unable to find profile_top')

    text = profile_top.get_text()
    pt_str = str(profile_top)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    for b in profile_top.find_all('b'):
        b_class = b.get('class')
        # elements without a class attribute yield None
        if (
            b_class is not None and len(b_class) == 1
            and b_class[0] == 'xcontrast_txt'
        ):
            fic.title = b.get_text()
            break
    else:
        raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

    fic.url = self.constructUrl(fic.localId, 1, fic.title)

    for div in profile_top.find_all('div'):
        div_class = div.get('class')
        if (
            div.get('style') == 'margin-top:2px' and div_class is not None
            and len(div_class) == 1 and div_class[0] == 'xcontrast_txt'
        ):
            fic.description = div.get_text()
            break
    else:
        raise Exception(
            'error: unable to find description:\n{}\n'.format(pt_str))

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
            'chapterCount?': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\S+)', int),
            'reviewCount?': (r'Reviews:\s+(\S+)', int),
            'favoriteCount?': (r'Favs:\s+(\S+)', int),
            'followCount?': (r'Follows:\s+(\S+)', int),
            'updated?': (r'Updated:\s+(\S+)', str),
            'published': (r'Published:\s+(\S+)', str),
        }
    )
    matcher.matchAll(fic)

    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    match = re.search(r'Status:\s+(\S+)', text)
    if match is None:
        fic.ficStatus = FicStatus.ongoing
    else:
        status = match.group(1)
        if status == 'Complete':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception('unknown status: {}'.format(status))

    for a in profile_top.find_all('a'):
        a_href = a.get('href')
        # anchors without an href attribute yield None
        if a_href is None:
            continue
        if a_href.startswith('/u/'):
            author = a.get_text()
            authorUrl = self.baseUrl + a_href
            authorId = a_href.split('/')[2]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    preStoryLinks = soup.find(id='pre_story_links')
    for a in preStoryLinks.find_all('a'):
        href = a.get('href')
        if href is None:
            continue
        hrefParts = href.split('/')

        # if it's a top level category
        if (
            len(hrefParts) == 3 and len(hrefParts[0]) == 0
            and len(hrefParts[2]) == 0
        ):
            cat = hrefParts[1]
            if cat in fictionPressCategories:
                continue  # skip categories
            raise Exception('unknown category: {}'.format(cat))

        # if it's a regular genre in some category
        if (
            len(hrefParts) == 4 and len(hrefParts[0]) == 0
            and len(hrefParts[3]) == 0
        ):
            # ensure category is in our map
            if hrefParts[1] not in fictionPressCategories:
                raise Exception(
                    'unknown category: {}'.format(hrefParts[1]))

            # ensure it's in our whitelist
            if hrefParts[2] not in fictionPressGenres:
                util.logMessage(
                    f'FictionPressAdapter: unknown genre {hrefParts[2]}')
                continue

            fic.add(Fandom.define(hrefParts[2]))
            continue

        util.logMessage(
            f'FictionPressAdapter: unknown genre {fic.id}: {href}')

    fic.upsert()

    chapterTitles = []
    if fic.chapterCount > 1:
        chapterSelect = soup.find(id='chap_select')
        chapterOptions = []
        if chapterSelect is not None:
            chapterOptions = chapterSelect.findAll('option')
        chapterTitles = [co.getText().strip() for co in chapterOptions]

    # cid is zero-based here; chapter ids are one-based
    for cid in range(fic.chapterCount):
        ch = fic.chapter(cid + 1)
        ch.localChapterId = str(cid + 1)
        if len(chapterTitles) > cid:
            ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
        elif fic.chapterCount == 1 and cid == 0:
            ch.title = fic.title
        ch.upsert()

    return fic