def extractSeriesReleases(self, row):
	"""Parse one row of the FoxTeller release table into a release packet.

	Expects a 4-column row: title, chapter, translator, release date.
	Returns a msgpackers release packet, or None if the row cannot be parsed.
	"""
	cells = row.find_all("td")
	if len(cells) != 4:
		# Unexpected table layout — dump the offending row for diagnosis.
		self.log.warning("Row does not have four <td> tags! Don't know how to handle")
		for line in row.prettify().split("\n"):
			self.log.warning(line)
		return None

	title_td, ch_td, trans_td, release_td = cells

	title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)
	author = trans_td.get_text(strip=True)
	if not title or not author:
		return None

	# Cripes this is probably brittle: a comma in the translator field is
	# taken to mean "author, translator" and thus a translated work.
	series_type = "translated" if "," in author else "oel"

	chp_title = ch_td.get_text(strip=True)
	vol, chp, frag, _ = extractTitle(chp_title)

	release_info = {
		'srcname'   : 'FoxTeller',
		'published' : float(release_td.span['data-timestamp']),
		'linkUrl'   : urllib.parse.urljoin("https://www.foxteller.com", ch_td.a['href']),
	}

	raw_msg = msgpackers._buildReleaseMessage(
			raw_item = release_info,
			series   = title,
			vol      = vol,
			chap     = chp,
			frag     = frag,
			postfix  = chp_title,
			tl_type  = series_type,
		)
	return msgpackers.createReleasePacket(raw_msg)
def dispatchBT(self, itemurl, itemtxt):
	"""Build and return a release packet for a Baka-Tsuki feed item.

	itemurl -- URL of the released page.
	itemtxt -- feed item title, e.g. "SomeSeries Volume 2 Chapter 5 by Bob".
	Side effect: queues itemurl for fetching via self.put_page_link().
	"""
	# Strip any trailing "by <author>" credit (any capitalization of "by").
	titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]
	# Probable series name: everything before a volume/chapter designator.
	probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()
	vol, chp, frag, post = extractTitle(titleonly)
	raw_item = {}
	raw_item['srcname']   = "Baka-Tsuki"
	raw_item['published'] = time.time()
	raw_item['linkUrl']   = itemurl
	self.put_page_link(itemurl)
	msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
	msg = msgpackers.createReleasePacket(msg)
	# BUG FIX: the packet was built but never returned (compare
	# dispatchNanoDesu), so callers always received None.
	return msg
def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
	"""Build and return a release packet for a Nano Desu feed item.

	The series title is looked up from the feed's netloc via NANO_DESU_MAP.
	Returns None when the item text yields neither a volume nor a chapter.
	Side effect: queues itemurl for fetching via self.put_page_link().
	"""
	series_title = NANO_DESU_MAP[netloc]
	vol, chp, frag, postfix = extractTitle(itemtxt)
	# Items without any volume/chapter number aren't releases we can track.
	if not (vol or chp):
		return None
	release_info = {
		'srcname'   : "Nano Desu",
		'published' : time.time(),
		'linkUrl'   : itemurl,
	}
	self.put_page_link(itemurl)
	packed = msgpackers.buildReleaseMessage(release_info, series_title, vol, chp, frag, postfix=postfix)
	return msgpackers.createReleasePacket(packed)
def dispatchBT(self, itemurl, itemtxt):
	"""Build and return a release packet for a Baka-Tsuki feed item.

	itemurl -- URL of the released page.
	itemtxt -- feed item title, e.g. "SomeSeries Volume 2 Chapter 5 by Bob".
	Side effect: queues itemurl for fetching via self.put_page_link().
	"""
	# Strip any trailing "by <author>" credit (any capitalization of "by").
	titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split(
		"BY")[0]
	# Probable series name: everything before a volume/chapter designator.
	probSeries = titleonly.lower().split("volume")[0].split(
		"chapter")[0].strip()
	vol, chp, frag, post = extractTitle(titleonly)
	raw_item = {}
	raw_item['srcname']   = "Baka-Tsuki"
	raw_item['published'] = time.time()
	raw_item['linkUrl']   = itemurl
	self.put_page_link(itemurl)
	msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
	msg = msgpackers.createReleasePacket(msg)
	# BUG FIX: the packet was built but never returned, so callers always
	# received None.
	return msg
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a RoyalRoadL series page.

	seriesPageUrl -- URL of the series page (used for series-id extraction
	                 and for rebasing relative chapter links).
	soup          -- bs4 soup of the page.

	Emits a series-info packet via AMQP as a side effect, and returns a list
	of release packets (empty list on any parse failure or filtered item).
	"""
	match = self.match_re.search(seriesPageUrl)
	series_id = match.group(1)

	conf = load_lut()
	assert 'force_sequential_numbering' in conf
	# Some series are known to have unparseable chapter titles; the LUT
	# forces sequential renumbering for those.
	must_renumber = series_id in conf['force_sequential_numbering']

	header = soup.find("div", class_='fic-title')
	titletg  = header.find("h2")
	authortg = header.find("h4")
	# The h4 contains a decorative span ("by" label or similar) that would
	# pollute get_text(); drop it.
	authortg.find("span").decompose()

	# RRL has (at least) two rating layouts; try both.
	ratingtg_type_1 = soup.find("div", class_='rating')
	ratingtg_type_2 = soup.find("li", text=re.compile('Overall Score'))
	if ratingtg_type_1:
		startg = ratingtg_type_1.find("span", class_='star')
	elif ratingtg_type_2:
		starcontainer = ratingtg_type_2.find_next_sibling("li")
		if not starcontainer:
			self.log.error("Could not find rating tag (starcontainer)!")
			return []
		startg = starcontainer.find("span", class_='star')
		if not startg:
			self.log.error("Could not find rating tag (startg)!")
			return []
	else:
		self.log.error("Could not find rating tag!")
		return []

	# The rating is encoded as a CSS class "star-NN" where NN is rating*10.
	ratingcls = [tmp for tmp in startg['class'] if re.match(r"star\-\d+", tmp)]
	# BUG FIX: this guard originally sat *after* ratingcls[0] was indexed,
	# so a missing star-class raised IndexError before it could fire.
	if not ratingcls:
		return []
	rating = ratingcls[0].split("-")[-1]
	rating = float(rating) / 10
	rating = rating * 2  # Normalize from a 0-5 to a 0-10 scale

	# Rating 0.0 is "no votes yet", which is allowed through.
	if not rating >= MIN_RATING and rating != 0.0:
		self.log.error("Item rating below upload threshold: %s", rating)
		return []

	if not titletg:
		self.log.error("Could not find title tag!")
		return []
	if not authortg:
		self.log.error("Could not find author tag!")
		return []

	title  = titletg.get_text().strip()
	author = authortg.get_text().strip()
	# Scrub any markup out of the extracted strings.
	title  = bleach.clean(title,  tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

	descDiv = soup.find('div', class_='description')
	if not descDiv or not descDiv.div:
		# FIX: log string was garbled by a stray line break in the source.
		self.log.error("Incomplete or broken description?")
		return []

	# Flatten the description into a list of <p>-wrapped text lines.
	desc = []
	for segment in descDiv.div:
		if isinstance(segment, bs4.NavigableString):
			desc.append(str(segment).strip())
		else:
			if segment.get_text().strip():
				desc.append(segment.get_text().strip())
	desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

	# Collect tags, applying the rename LUT to normalize them.
	tags = []
	tagdiv = soup.find('div', class_='tags')
	for tag in tagdiv.find_all('span', class_='label'):
		tagtxt = tag.get_text().strip().lower().replace(" ", "-")
		if tagtxt in conf['tag_rename']:
			tagtxt = conf['tag_rename'][tagtxt]
		tags.append(tagtxt)

	# Content-warning labels (red list) are treated as tags too.
	info_div = soup.find("div", class_='fiction-info')
	warning_div = info_div.find("div", class_='font-red-sunglo')
	if warning_div:
		for warning_tag in warning_div.find_all('li'):
			tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
			if tagtxt in conf['tag_rename']:
				tagtxt = conf['tag_rename'][tagtxt]
			tags.append(tagtxt)

	seriesmeta = {}
	seriesmeta['title']       = title
	seriesmeta['author']      = author
	seriesmeta['tags']        = tags
	seriesmeta['homepage']    = seriesPageUrl
	seriesmeta['desc']        = "\r\n".join(desc)
	seriesmeta['tl_type']     = 'oel'
	seriesmeta['sourcesite']  = 'RoyalRoadL'
	seriesmeta['create_tags'] = True

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'

	chapters = soup.find_all("tr", attrs={"data-url" : True})
	raw_retval = []
	for chapter in chapters:
		if len(chapter.find_all("td")) != 2:
			self.log.warning("Row with invalid number of entries?")
			continue
		cname, cdate = chapter.find_all("td")
		reldate = cdate.time['unixtime']
		relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)
		chp_title = cname.get_text().strip()
		# The series title is appended so extractTitle can strip it back out
		# of chapter names that embed it.
		vol, chp, frag, post = extractTitle(chp_title + " " + title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = float(reldate)
		raw_item['linkUrl']   = relurl
		raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
			author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
		raw_retval.append(raw_msg)

	# If >80% of a reasonably-sized chapter list has no parseable numbering
	# (or the LUT forces it), fall back to sequential chapter numbers.
	missing_chap = 0
	for item in raw_retval:
		if not (item['vol'] or item['chp']):
			missing_chap += 1
	if len(raw_retval):
		unnumbered = (missing_chap / len(raw_retval)) * 100
		if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
			if must_renumber:
				self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
			else:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
			chap = 1
			for item in raw_retval:
				item['vol'] = None
				item['chp'] = chap
				chap += 1

	# Do not add series without 3 chapters.
	if len(raw_retval) < 3:
		self.log.info("Less then three chapters!")
		return []
	if not raw_retval:
		self.log.info("Retval empty?!")
		return []

	self.amqp_put_item(meta_pkt)
	retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a NovelUpdates series page.

	Scrapes the link-valued and text-valued info boxes, builds a series-info
	packet (sent via AMQP as a side effect), and returns a list of release
	messages — empty list on any parse failure.
	"""
	titletg   = soup.find("h4",  class_='seriestitle')
	altnametg = soup.find("div", id='editassociated')
	descrtg   = soup.find("div", id='editdescription')

	# Fields whose values are the text of contained <a> tags.
	link_sets = {
		'authortg'    : soup.find("div", id='showauthors'),
		'artisttg'    : soup.find("div", id='showartists'),
		'langtg'      : soup.find("div", id='showlang'),
		'genretg'     : soup.find("div", id='seriesgenre'),
		'tagstg'      : soup.find("div", id='showtags'),
		'typetg'      : soup.find("div", id='showtype'),
		'orig_pub_tg' : soup.find("div", id='showopublisher'),
		'eng_pub_tg'  : soup.find("div", id='showepublisher'),
	}
	# Fields whose values are bare text nodes in the container.
	text_sets = {
		'transcompletetg' : soup.find("div", id='showtranslated'),
		'yeartg'          : soup.find("div", id='edityear'),
		'coostatustg'     : soup.find("div", id='editstatus'),
		'licensedtg'      : soup.find("div", id='showlicensed'),
	}

	if not titletg:
		self.log.warn("Could not find item title!")
		return []
	if not altnametg:
		self.log.warn("Could not find alt-name container tag!")
		return []
	if not descrtg:
		self.log.warn("Could not find description container tag!")
		return []

	data_sets = {}
	for key in list(link_sets.keys()):
		if not link_sets[key]:
			self.log.warn("Could not find tag for name: '%s'", key)
			return []
		data_sets[key] = [tag.get_text() for tag in link_sets[key].find_all("a")]
	for key in list(text_sets.keys()):
		if not text_sets[key]:
			self.log.warn("Could not find tag for name: '%s'", key)
			return []
		data_sets[key] = [tmp.strip() for tmp in text_sets[key].contents if isinstance(tmp, bs4.NavigableString)]

	title = titletg.get_text().strip()
	data_sets['title'] = title
	data_sets['altnames'] = [tmp.strip() for tmp in altnametg.contents if isinstance(tmp, bs4.NavigableString)]

	# Scrub incoming markup from every extracted value.
	for key in list(data_sets.keys()):
		if isinstance(data_sets[key], list):
			data_sets[key] = [bleach.clean(val, tags=[], attributes=[], styles=[], strip=True, strip_comments=True).strip()
				for val in data_sets[key]]
		else:
			data_sets[key] = bleach.clean(data_sets[key], tags=[], attributes=[], styles=[],
				strip=True, strip_comments=True).strip()

	# Convert the publication year (e.g. ['2013']) to a unix timestamp of
	# Jan 1 of that year, or None when absent.
	if data_sets['yeartg'] and data_sets['yeartg'][0]:
		# FIX: was a bare debug print(); routed through the logger.
		self.log.info("Non-null data_sets['yeartg']: %s", data_sets['yeartg'])
		tmp_d = datetime.datetime(year=int(data_sets['yeartg'].pop()), month=1, day=1)
		data_sets['yeartg'] = calendar.timegm(tmp_d.timetuple())
	else:
		data_sets['yeartg'] = None

	# FIX: removed a large no-op dict-literal expression statement that sat
	# here (leftover sample data from development); it allocated and
	# discarded a dict on every call and had no effect.

	data_sets['description'] = bleach.clean(descrtg.prettify(),
		tags=['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol', 'strong', 'ul', 'p'],
		strip=True).strip()

	series_message = {
		'update_only'   : False,
		'sourcesite'    : "NovelUpdates",
		'title'         : data_sets['title'],
		'alt_titles'    : data_sets['altnames'] + [data_sets['title'], ],
		'desc'          : data_sets['description'],
		'author'        : data_sets['authortg'],
		'illust'        : data_sets['artisttg'],
		'pubdate'       : data_sets['yeartg'],
		'pubnames'      : data_sets['orig_pub_tg'] + data_sets['eng_pub_tg'],
		'tags'          : data_sets['tagstg'],
		# AFICT, NovelUpdates doesn't have any english items, but wth.
		'tl_type'       : "translated" if 'English' not in data_sets['langtg'] else "oel",
		# New:
		'coostate'      : data_sets['coostatustg'],
		'type'          : data_sets['typetg'],
		'genres'        : data_sets['genretg'],
		'licensed'      : data_sets['licensedtg'],
		'transcomplete' : data_sets['transcompletetg'],
	}

	pkt = msgpackers.createSeriesInfoPacket(series_message, matchAuthor=True, beta=self.is_beta)

	extra = {}
	extra['tags']       = data_sets['tagstg']
	extra['sourcesite'] = 'Unknown'

	chapter_tbl = soup.find("table", class_='tablesorter')
	releases = chapter_tbl.find_all("tr")
	retval = []
	for release in releases:
		items = release.find_all("td")
		if len(items) == 0:
			# Header row.
			continue
		# NOTE(review): assumes exactly three columns per data row — a
		# layout change would raise here. Confirm against live pages.
		date_tg, group_tg, chp_tg = items
		rel = datetime.datetime.strptime(date_tg.get_text(), '%m/%d/%y')
		# Dates have day granularity; use "now" for today's releases so
		# same-day items sort sensibly.
		if rel.date() == datetime.date.today():
			reldate = time.time()
		else:
			reldate = calendar.timegm(rel.timetuple())
		chp_title  = chp_tg.get_text().strip()
		group_name = group_tg.get_text().strip()
		vol, chp, frag, post = extractTitle(chp_title)
		raw_item = {}
		raw_item['srcname']   = msgpackers.fixSmartQuotes(group_name)
		raw_item['published'] = reldate
		raw_item['linkUrl']   = chp_tg.a['href']
		msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
			author=data_sets['authortg'], postfix=chp_title, tl_type='translated',
			extraData=extra, matchAuthor=True)
		retval.append(msg)

	# If >80% of a reasonably-sized chapter list has no parseable numbering,
	# fall back to sequential chapter numbers.
	missing_chap = 0
	for item in retval:
		if not (item['vol'] or item['chp']):
			missing_chap += 1
	if len(retval):
		unnumbered = (missing_chap / len(retval)) * 100
		if len(retval) >= 5 and unnumbered > 80:
			self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
			chap = 1
			for item in retval:
				item['vol'] = None
				item['chp'] = chap
				chap += 1

	# # Do not add series without 3 chapters.
	# if len(retval) < 3:
	# 	self.log.info("Less then three chapters!")
	# 	return []

	if not retval:
		self.log.info("Retval empty?!")
		return []

	self.amqp_put_item(pkt)
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a JapTem Fanfic page.

	Builds a series-info packet (sent via AMQP as a side effect when any
	releases are found) and returns a list of release packets — empty list
	on any parse failure or filtered item.
	"""
	title  = soup.find("div", class_='fanfic_title_div').get_text()
	author = soup.find("div", class_='fanfic_author_div').get_text()

	# The rating/views/chapters line is a bare text node in the title
	# wrapper, e.g. "Rating 4.5 · 1234 views · 12 chapters".
	ratingtg = soup.find("div", class_='fanfic_title_wrapper')
	ratingtg = [
		item for item in ratingtg.contents if "Rating" in str(item)
	]
	if not ratingtg:
		# BUG FIX: previously fell through with ratingtg = '' and crashed in
		# the 3-way unpack of ''.split("·") below. Bail out instead.
		return []
	ratingtg = ratingtg.pop()
	rating, views, chapters = ratingtg.split("·")

	# I think the japtem rating system is just plain out broken.
	if not "no rating" in ratingtg.lower():
		rating_score = float(rating.split()[-1])
		if not rating_score >= MIN_RATING:
			return []

	# Do not add series without 3 chapters.
	chapter_num = float(chapters.split()[0])
	if chapter_num < 3:
		return []

	if not title:
		return []
	if not author:
		return []

	descDiv = soup.find('div', class_='fanfic_synopsis')
	if not descDiv:
		# BUG FIX: previously only print()ed the soup and then crashed on
		# descDiv.find_all() below. Dump for diagnosis and bail out.
		print(soup)
		return []

	# Split the synopsis paragraphs into description text and a
	# "Categories:" tag list.
	paras = descDiv.find_all("p")
	tags = []
	desc = []
	for para, text in [(para, para.get_text()) for para in paras]:
		if text.lower().startswith('categories:'):
			tagstr = text.split(":", 1)[-1]
			for item in tagstr.split(","):
				if item.strip():
					tags.append(item.strip())
		else:
			desc.append(para)

	seriesmeta = {}
	seriesmeta['title']      = title
	seriesmeta['author']     = author
	seriesmeta['tags']       = tags
	seriesmeta['homepage']   = ''
	seriesmeta['desc']       = " ".join([str(para) for para in desc])
	seriesmeta['tl_type']    = 'oel'
	seriesmeta['sourcesite'] = 'JapTem Fanfic'

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	extra = {}
	extra['tags']       = tags
	extra['homepage']   = ''
	extra['sourcesite'] = 'JapTem Fanfic'

	retval = []
	chapters = soup.find("ul", class_='fanfic_chapter_list')
	volumes = chapters.find_all('li', class_='fanfic_volume')
	for volume in volumes:
		releases = volume.find_all('li', class_='fanfic_chapter')
		for release in releases:
			chp_title = release.find("a")
			vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
			# The page carries no per-chapter date; use "now".
			reldate = time.time()
			chp_title = chp_title.get_text()
			# Prefix the volume title so extractTitle can pick up the volume number.
			agg_title = " ".join((vol_str, chp_title))
			vol, chp, frag, post = extractTitle(agg_title)
			raw_item = {}
			raw_item['srcname']   = 'JapTem Fanfic'
			raw_item['published'] = reldate
			releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
			raw_item['linkUrl']   = releaseurl
			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
				author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
			msg = msgpackers.createReleasePacket(raw_msg)
			retval.append(msg)

	if not retval:
		return []
	self.amqp_put_item(meta_pkt)
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a RoyalRoadL series
	page (OpenGraph-meta rating layout).

	Returns a list of release packets, or an empty list on any parse
	failure or filtered item.
	"""
	match = self.match_re.search(seriesPageUrl)
	series_id = match.group(1)

	conf = load_lut()
	assert 'force_sequential_numbering' in conf
	# Some series are known to have unparseable chapter titles; the LUT
	# forces sequential renumbering for those.
	must_renumber = series_id in conf['force_sequential_numbering']

	header = soup.find("div", class_='fic-title')
	titletg  = header.find("h1")
	authortg = header.find("h4")
	# The h4 contains a decorative span that would pollute get_text(); drop it.
	authortg.find("span").decompose()

	# Rating comes from the OpenGraph books:rating meta tags.
	rating_val   = soup.find("meta", property='books:rating:value')
	rating_scale = soup.find("meta", property='books:rating:scale')
	# FIX: were bare debug print() calls; routed through the logger.
	self.log.info("Rating value: %s", rating_val)
	self.log.info("Rating scale: %s", rating_scale)

	if not rating_val or not rating_scale:
		return []

	rval_f   = float(rating_val.get('content', "0"))
	rscale_f = float(rating_scale.get('content', "999999"))
	rating = 5 * (rval_f / rscale_f)
	self.log.info("Float rating: %s", rating)

	# Rating 0.0 is "no votes yet", which is allowed through.
	if not rating >= MIN_RATING and rating != 0.0:
		self.log.error("Item rating below upload threshold: %s", rating)
		return []

	if not titletg:
		self.log.error("Could not find title tag!")
		return []
	if not authortg:
		self.log.error("Could not find author tag!")
		return []

	title  = titletg.get_text().strip()
	author = authortg.get_text().strip()
	# Scrub any markup out of the extracted strings.
	title  = bleach.clean(title,  tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

	descDiv = soup.find('div', class_='description')
	if not descDiv or not descDiv.div:
		self.log.error("Incomplete or broken description?")
		return []

	# Flatten the description into a list of <p>-wrapped text lines.
	desc = []
	for segment in descDiv.div:
		if isinstance(segment, bs4.NavigableString):
			desc.append(str(segment).strip())
		else:
			if segment.get_text().strip():
				desc.append(segment.get_text().strip())
	desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

	# Collect tags, applying the rename LUT to normalize them.
	tags = []
	tagdiv = soup.find('span', class_='tags')
	for tag in tagdiv.find_all('span', class_='label'):
		tagtxt = tag.get_text().strip().lower().replace(" ", "-")
		if tagtxt in conf['tag_rename']:
			tagtxt = conf['tag_rename'][tagtxt]
		tags.append(tagtxt)

	# Content-warning labels (red list) are treated as tags too.
	info_div = soup.find("div", class_='fiction-info')
	warning_div = info_div.find("div", class_='font-red-sunglo')
	if warning_div:
		for warning_tag in warning_div.find_all('li'):
			tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
			if tagtxt in conf['tag_rename']:
				tagtxt = conf['tag_rename'][tagtxt]
			tags.append(tagtxt)

	seriesmeta = {}
	seriesmeta['title']       = msgpackers.fix_string(title)
	seriesmeta['author']      = msgpackers.fix_string(author)
	seriesmeta['tags']        = tags
	seriesmeta['homepage']    = seriesPageUrl
	seriesmeta['desc']        = "\r\n".join(desc)
	seriesmeta['tl_type']     = 'oel'
	seriesmeta['sourcesite']  = 'RoyalRoadL'
	seriesmeta['create_tags'] = True

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'

	chapters = soup.find_all("tr", attrs={"data-url": True})
	raw_retval = []
	for chapter in chapters:
		if len(chapter.find_all("td")) != 2:
			self.log.warning("Row with invalid number of entries?")
			continue
		cname, cdate = chapter.find_all("td")
		reldate = cdate.time['unixtime']
		relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)
		chp_title = cname.get_text().strip()
		# The series title is appended so extractTitle can strip it back out
		# of chapter names that embed it.
		vol, chp, frag, post = extractTitle(chp_title + " " + title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = float(reldate)
		raw_item['linkUrl']   = relurl
		raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
			author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
		raw_retval.append(raw_msg)

	# If >80% of a reasonably-sized chapter list has no parseable numbering
	# (or the LUT forces it), fall back to sequential chapter numbers.
	missing_chap = 0
	for item in raw_retval:
		if not (item['vol'] or item['chp']):
			missing_chap += 1
	if len(raw_retval):
		unnumbered = (missing_chap / len(raw_retval)) * 100
		if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
			if must_renumber:
				self.log.warning(
					"Item numbering force-overridden! Adding simple sequential chapter numbers."
				)
			else:
				self.log.warning(
					"Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
				)
			chap = 1
			for item in raw_retval:
				item['vol'] = None
				item['chp'] = chap
				chap += 1

	# Do not add series without 3 chapters.
	if len(raw_retval) < 3:
		self.log.info("Less then three chapters!")
		return []
	if not raw_retval:
		self.log.info("Retval empty?!")
		return []

	# NOTE(review): the series-info packet is built but its AMQP send is
	# commented out (unlike the sibling implementations) — confirm this is
	# intentional.
	# self.amqp_put_item(meta_pkt)
	retval = [
		msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
	]
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a JapTem page.

	Returns a list of release packets with the series-info packet appended
	at the end, or an empty list on any parse failure or filtered item.
	"""
	title  = soup.find("div", class_='fanfic_title_div').get_text()
	author = soup.find("div", class_='fanfic_author_div').get_text()

	# The rating/views/chapters line is a bare text node in the title
	# wrapper, e.g. "Rating 4.5 · 1234 views · 12 chapters".
	ratingtg = soup.find("div", class_='fanfic_title_wrapper')
	ratingtg = [item for item in ratingtg.contents if "Rating" in str(item)]
	if not ratingtg:
		# BUG FIX: previously fell through with ratingtg = '' and crashed in
		# the 3-way unpack of ''.split("·") below. Bail out instead.
		return []
	ratingtg = ratingtg.pop()
	rating, views, chapters = ratingtg.split("·")

	# I think the japtem rating system is just plain out broken.
	if not "no rating" in ratingtg.lower():
		rating_score = float(rating.split()[-1])
		if not rating_score >= MIN_RATING:
			return []

	# Do not add series without 3 chapters.
	chapter_num = float(chapters.split()[0])
	if chapter_num < 3:
		return []

	if not title:
		return []
	if not author:
		return []

	descDiv = soup.find('div', class_='fanfic_synopsis')
	if not descDiv:
		# BUG FIX: previously only print()ed the soup and then crashed on
		# descDiv.find_all() below. Dump for diagnosis and bail out.
		print(soup)
		return []

	# Split the synopsis paragraphs into description text and a
	# "Categories:" tag list.
	paras = descDiv.find_all("p")
	tags = []
	desc = []
	for para, text in [(para, para.get_text()) for para in paras]:
		if text.lower().startswith('categories:'):
			tagstr = text.split(":", 1)[-1]
			for item in tagstr.split(","):
				if item.strip():
					tags.append(item.strip())
		else:
			desc.append(para)

	seriesmeta = {}
	seriesmeta['title']      = title
	seriesmeta['author']     = author
	seriesmeta['tags']       = tags
	seriesmeta['homepage']   = ''
	seriesmeta['desc']       = " ".join([str(para) for para in desc])
	seriesmeta['tl_type']    = 'oel'
	seriesmeta['sourcesite'] = 'JapTem'

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	extra = {}
	extra['tags']       = tags
	extra['homepage']   = ''
	extra['sourcesite'] = 'JapTem'

	retval = []
	chapters = soup.find("ul", class_='fanfic_chapter_list')
	volumes = chapters.find_all('li', class_='fanfic_volume')
	for volume in volumes:
		releases = volume.find_all('li', class_='fanfic_chapter')
		for release in releases:
			chp_title = release.find("a")
			vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
			# The page carries no per-chapter date; use "now".
			reldate = time.time()
			chp_title = chp_title.get_text()
			# Prefix the volume title so extractTitle can pick up the volume number.
			agg_title = " ".join((vol_str, chp_title))
			vol, chp, frag, post = extractTitle(agg_title)
			raw_item = {}
			raw_item['srcname']   = "JapTem"
			raw_item['published'] = reldate
			releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
			raw_item['linkUrl']   = releaseurl
			msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
				author=author, postfix=chp_title, tl_type='oel', extraData=extra)
			msg = msgpackers.createReleasePacket(msg)
			retval.append(msg)

	if not retval:
		return []
	# The series-info packet rides along with the releases here rather than
	# being sent over AMQP directly.
	retval.append(meta_pkt)
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a RoyalRoadL series
	page (old page layout: h1.fiction-title / span.author / span.overall).

	Emits a series-info packet via AMQP as a side effect, and returns a
	list of release packets (empty list on parse failure or filtered item).
	"""
	titletg  = soup.find("h1", class_='fiction-title')
	authortg = soup.find("span", class_='author')
	ratingtg = soup.find("span", class_='overall')
	if not ratingtg:
		self.log.info("Could not find rating tag!")
		return []
	rating = float(ratingtg['score'])
	# Rating 0.0 is "no votes yet", which is allowed through.
	if not rating >= MIN_RATING and rating != 0.0:
		self.log.info("Item rating below upload threshold: %s", rating)
		return []
	if not titletg:
		self.log.info("Could not find title tag!")
		return []
	if not authortg:
		self.log.info("Could not find author tag!")
		return []
	title  = titletg.get_text()
	author = authortg.get_text()
	# Author text is rendered as "by <name>"; strip the prefix.
	assert author.startswith("by ")
	author = author[2:].strip()
	# Scrub any markup out of the extracted strings.
	title  = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	descDiv = soup.find('div', class_='description')
	paras = descDiv.find_all("p")
	# Split the description paragraphs into free text and a "Categories:" tag list.
	tags = []
	desc = []
	for para, text in [(para, para.get_text()) for para in paras]:
		if text.lower().startswith('categories:'):
			tagstr = text.split(":", 1)[-1]
			items = tagstr.split(",")
			# Side-effect comprehension: appends each non-empty tag.
			[tags.append(item.strip()) for item in items if item.strip()]
		else:
			desc.append(para)
	seriesmeta = {}
	seriesmeta['title']      = title
	seriesmeta['author']     = author
	seriesmeta['tags']       = tags
	seriesmeta['homepage']   = seriesPageUrl
	seriesmeta['desc']       = " ".join([str(para) for para in desc])
	seriesmeta['tl_type']    = 'oel'
	seriesmeta['sourcesite'] = 'RoyalRoadL'
	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)
	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'
	chapters = soup.find("div", class_='chapters')
	releases = chapters.find_all('li', class_='chapter')
	raw_retval = []
	for release in releases:
		# Each chapter <li> holds two spans: title and release date.
		chp_title, reldatestr = release.find_all("span")
		rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
		# Dates have day granularity; use "now" for today's releases.
		if rel.date() == datetime.date.today():
			reldate = time.time()
		else:
			reldate = calendar.timegm(rel.timetuple())
		chp_title = chp_title.get_text()
		# print("Chp title: '{}'".format(chp_title))
		# The series title is appended so extractTitle can strip it back
		# out of chapter names that embed it.
		vol, chp, frag, post = extractTitle(chp_title + " " + title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = reldate
		raw_item['linkUrl']   = release.a['href']
		raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
		raw_retval.append(raw_msg)
	# If >80% of a reasonably-sized chapter list has no parseable
	# numbering, fall back to sequential chapter numbers.
	missing_chap = 0
	for item in raw_retval:
		if not (item['vol'] or item['chp']):
			missing_chap += 1
	if len(raw_retval):
		unnumbered = (missing_chap/len(raw_retval)) * 100
		if len(raw_retval) >= 5 and unnumbered > 80:
			self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
			chap = 1
			for item in raw_retval:
				item['vol'] = None
				item['chp'] = chap
				chap += 1
	# Do not add series without 3 chapters.
	if len(raw_retval) < 3:
		self.log.info("Less then three chapters!")
		return []
	if not raw_retval:
		self.log.info("Retval empty?!")
		return []
	self.amqp_put_item(meta_pkt)
	retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a RoyalRoadL series
	page (oldest variant: no logging, no markup scrubbing, no renumbering).

	Emits a series-info packet via AMQP as a side effect, and returns a
	list of release messages (empty list on parse failure or filtered item).
	"""
	titletg  = soup.find("h1", class_='fiction-title')
	authortg = soup.find("span", class_='author')
	ratingtg = soup.find("span", class_='overall')
	if not ratingtg:
		return []
	if not float(ratingtg['score']) >= MIN_RATING:
		return []
	if not titletg:
		return []
	if not authortg:
		return []
	# NOTE(review): duplicate of the ratingtg check above — harmless.
	if not ratingtg:
		return []
	title  = titletg.get_text()
	author = authortg.get_text()
	# Author text is rendered as "by <name>"; strip the prefix.
	assert author.startswith("by ")
	author = author[2:].strip()
	descDiv = soup.find('div', class_='description')
	paras = descDiv.find_all("p")
	# Split the description paragraphs into free text and a "Categories:" tag list.
	tags = []
	desc = []
	for para, text in [(para, para.get_text()) for para in paras]:
		if text.lower().startswith('categories:'):
			tagstr = text.split(":", 1)[-1]
			items = tagstr.split(",")
			# Side-effect comprehension: appends each non-empty tag.
			[tags.append(item.strip()) for item in items if item.strip()]
		else:
			desc.append(para)
	seriesmeta = {}
	seriesmeta['title']      = title
	seriesmeta['author']     = author
	seriesmeta['tags']       = tags
	seriesmeta['homepage']   = seriesPageUrl
	seriesmeta['desc']       = " ".join([str(para) for para in desc])
	seriesmeta['tl_type']    = 'oel'
	seriesmeta['sourcesite'] = 'RoyalRoadL'
	# NOTE(review): every sibling implementation calls
	# createSeriesInfoPacket here; sendSeriesInfoPacket may be an older
	# API — confirm it still exists in msgpackers.
	pkt = msgpackers.sendSeriesInfoPacket(seriesmeta)
	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'
	chapters = soup.find("div", class_='chapters')
	releases = chapters.find_all('li', class_='chapter')
	retval = []
	for release in releases:
		# Each chapter <li> holds two spans: title and release date.
		chp_title, reldatestr = release.find_all("span")
		rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
		# Dates have day granularity; use "now" for today's releases.
		if rel.date() == datetime.date.today():
			reldate = time.time()
		else:
			reldate = calendar.timegm(rel.timetuple())
		chp_title = chp_title.get_text()
		# print("Chp title: '{}'".format(chp_title))
		vol, chp, frag, post = extractTitle(chp_title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = reldate
		raw_item['linkUrl']   = release.a['href']
		msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra)
		retval.append(msg)
	# Do not add series without 3 chapters.
	if len(retval) < 3:
		return []
	if not retval:
		return []
	self.amqp_put_item(pkt)
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""Extract series metadata and chapter releases from a RoyalRoadL series
	page (old layout, with logging and sequential-renumber fallback).

	Emits a series-info packet via AMQP as a side effect, and returns a
	list of release messages (empty list on parse failure or filtered item).
	"""
	titletg  = soup.find("h1", class_='fiction-title')
	authortg = soup.find("span", class_='author')
	ratingtg = soup.find("span", class_='overall')
	if not ratingtg:
		self.log.info("Could not find rating tag!")
		return []
	rating = float(ratingtg['score'])
	# Rating 0.0 is "no votes yet", which is allowed through.
	if not rating >= MIN_RATING and rating != 0.0:
		self.log.info("Item rating below upload threshold: %s", rating)
		return []
	if not titletg:
		self.log.info("Could not find title tag!")
		return []
	if not authortg:
		self.log.info("Could not find author tag!")
		return []
	title  = titletg.get_text()
	author = authortg.get_text()
	# Author text is rendered as "by <name>"; strip the prefix.
	assert author.startswith("by ")
	author = author[2:].strip()
	# Scrub any markup out of the extracted strings.
	title  = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	descDiv = soup.find('div', class_='description')
	paras = descDiv.find_all("p")
	# Split the description paragraphs into free text and a "Categories:" tag list.
	tags = []
	desc = []
	for para, text in [(para, para.get_text()) for para in paras]:
		if text.lower().startswith('categories:'):
			tagstr = text.split(":", 1)[-1]
			items = tagstr.split(",")
			# Side-effect comprehension: appends each non-empty tag.
			[tags.append(item.strip()) for item in items if item.strip()]
		else:
			desc.append(para)
	seriesmeta = {}
	seriesmeta['title']      = title
	seriesmeta['author']     = author
	seriesmeta['tags']       = tags
	seriesmeta['homepage']   = seriesPageUrl
	seriesmeta['desc']       = " ".join([str(para) for para in desc])
	seriesmeta['tl_type']    = 'oel'
	seriesmeta['sourcesite'] = 'RoyalRoadL'
	pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)
	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'
	chapters = soup.find("div", class_='chapters')
	releases = chapters.find_all('li', class_='chapter')
	retval = []
	for release in releases:
		# Each chapter <li> holds two spans: title and release date.
		chp_title, reldatestr = release.find_all("span")
		rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
		# Dates have day granularity; use "now" for today's releases.
		if rel.date() == datetime.date.today():
			reldate = time.time()
		else:
			reldate = calendar.timegm(rel.timetuple())
		chp_title = chp_title.get_text()
		# print("Chp title: '{}'".format(chp_title))
		vol, chp, frag, post = extractTitle(chp_title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = reldate
		raw_item['linkUrl']   = release.a['href']
		msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
		retval.append(msg)
	# If >80% of a reasonably-sized chapter list has no parseable
	# numbering, fall back to sequential chapter numbers.
	missing_chap = 0
	for item in retval:
		if not (item['vol'] or item['chp']):
			missing_chap += 1
	if len(retval):
		unnumbered = (missing_chap/len(retval)) * 100
		if len(retval) >= 5 and unnumbered > 80:
			self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
			chap = 1
			for item in retval:
				item['vol'] = None
				item['chp'] = chap
				chap += 1
	# Do not add series without 3 chapters.
	if len(retval) < 3:
		self.log.info("Less then three chapters!")
		return []
	if not retval:
		self.log.info("Retval empty?!")
		return []
	self.amqp_put_item(pkt)
	return retval