def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Extract release messages for a RoyalRoadL series page.

    Parses title, author, rating, description/tags and the chapter list
    out of ``soup``, emits a series-info packet over AMQP, and returns a
    list of release messages. Returns an empty list when required page
    elements are missing, the rating is below ``MIN_RATING``, or the
    series has fewer than 3 chapters.

    :param seriesPageUrl: URL of the series page (used as the homepage).
    :param soup: bs4-parsed series page.
    :return: list of release messages (possibly empty).
    """
    titletg = soup.find("h1", class_='fiction-title')
    authortg = soup.find("span", class_='author')
    ratingtg = soup.find("span", class_='overall')

    # Bail out if any required element is missing, or the series is
    # rated below the acceptance threshold.
    if not titletg or not authortg or not ratingtg:
        return []
    if not float(ratingtg['score']) >= MIN_RATING:
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    assert author.startswith("by ")
    # [2:] leaves the leading space from "by "; strip() removes it.
    author = author[2:].strip()

    descDiv = soup.find('div', class_='description')
    paras = descDiv.find_all("p")
    tags = []
    desc = []
    for para in paras:
        text = para.get_text()
        if text.lower().startswith('categories:'):
            # Comma-separated category list; drop empty fragments.
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        'title':      title,
        'author':     author,
        'tags':       tags,
        'homepage':   seriesPageUrl,
        'desc':       " ".join(str(para) for para in desc),
        'tl_type':    'oel',
        'sourcesite': 'RoyalRoadL',
    }
    pkt = msgpackers.sendSeriesInfoPacket(seriesmeta)

    extra = {
        'tags':       tags,
        'homepage':   seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('li', class_='chapter')
    retval = []
    for release in releases:
        chp_title, reldatestr = release.find_all("span")
        rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
        # The page only has day resolution; treat a release dated today
        # as "now" so it sorts sensibly against finer-grained sources.
        if rel.date() == datetime.date.today():
            reldate = time.time()
        else:
            reldate = calendar.timegm(rel.timetuple())
        chp_title = chp_title.get_text()
        vol, chp, frag, post = extractTitle(chp_title)
        raw_item = {
            'srcname':   "RoyalRoadL",
            'published': reldate,
            'linkUrl':   release.a['href'],
        }
        msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                             author=author, postfix=chp_title,
                                             tl_type='oel', extraData=extra)
        retval.append(msg)

    # Do not add series without at least 3 chapters. (This also covers
    # the empty case, so no separate `if not retval` check is needed.)
    if len(retval) < 3:
        return []
    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Extract release messages for a JapTem fanfic series page.

    Parses title, author, rating line, synopsis/tags and the per-volume
    chapter lists out of ``soup`` and returns a list of release packets
    with the series-info packet appended. Returns an empty list when the
    rating is below ``MIN_RATING``, the series has fewer than 3 chapters,
    or required page elements are missing.

    :param seriesPageUrl: URL of the series page (used to resolve
        relative chapter links).
    :param soup: bs4-parsed series page.
    :return: list of release packets plus one series-meta packet
        (possibly empty).
    """
    title = soup.find("div", class_="fanfic_title_div").get_text()
    author = soup.find("div", class_="fanfic_author_div").get_text()
    ratingtg = soup.find("div", class_="fanfic_title_wrapper")
    rating_items = [item for item in ratingtg.contents if "Rating" in str(item)]
    if not rating_items:
        # No "Rating ... · ... views · ... chapters" line at all.
        # (The previous code fell through with an empty string and
        # crashed unpacking "".split("·") below.)
        return []
    rating_line = rating_items.pop()
    rating, views, chapters = rating_line.split("·")

    # I think the japtem rating system is just plain out broken: only
    # filter on score when a numeric rating is actually present.
    if "no rating" not in rating_line.lower():
        rating_score = float(rating.split()[-1])
        if not rating_score >= MIN_RATING:
            return []

    chapter_num = float(chapters.split()[0])
    if chapter_num < 3:
        return []
    if not title or not author:
        return []

    descDiv = soup.find("div", class_="fanfic_synopsis")
    if not descDiv:
        # Layout change or error page: dump for debugging and bail
        # instead of crashing on descDiv.find_all below.
        print(soup)
        return []

    paras = descDiv.find_all("p")
    tags = []
    desc = []
    for para in paras:
        text = para.get_text()
        if text.lower().startswith("categories:"):
            # Comma-separated category list; drop empty fragments.
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        "title":      title,
        "author":     author,
        "tags":       tags,
        "homepage":   "",
        "desc":       " ".join(str(para) for para in desc),
        "tl_type":    "oel",
        "sourcesite": "JapTem",
    }
    meta_pkt = msgpackers.sendSeriesInfoPacket(seriesmeta)

    extra = {
        "tags":       tags,
        "homepage":   "",
        "sourcesite": "JapTem",
    }

    retval = []
    chapter_list = soup.find("ul", class_="fanfic_chapter_list")
    volumes = chapter_list.find_all("li", class_="fanfic_volume")
    for volume in volumes:
        # Volume title is the same for every chapter in the volume;
        # look it up once instead of inside the inner loop.
        vol_str = volume.find("div", class_="fanfic_volume_title").get_text()
        for release in volume.find_all("li", class_="fanfic_chapter"):
            chp_title = release.find("a").get_text()
            # The chapter list carries no dates; stamp with "now".
            reldate = time.time()
            agg_title = " ".join((vol_str, chp_title))
            vol, chp, frag, post = extractTitle(agg_title)
            releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a["href"])
            raw_item = {
                "srcname":   "JapTem",
                "published": reldate,
                "linkUrl":   releaseurl,
            }
            msg = msgpackers.buildReleaseMessage(
                raw_item, title, vol, chp, frag,
                author=author, postfix=chp_title,
                tl_type="oel", extraData=extra
            )
            msg = msgpackers.createReleasePacket(msg)
            retval.append(msg)

    if not retval:
        return []
    retval.append(meta_pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, metadata, soup):
    """
    Extract release messages for a WattPad series from its API metadata.

    Filters out short (< 3 parts), unpopular (< 100 votes), non-English,
    explicitly blocked, and masked-tag items; requires at least one tag
    from ``WATTPAD_REQUIRED_TAGS``. Emits a series-info packet over AMQP
    and returns the list of release messages.

    :param seriesPageUrl: URL of the series page (used as the homepage).
    :param metadata: dict decoded from the WattPad API (keys used:
        title, user.name, description, tags, numParts, voteCount,
        language.id, id, parts).
    :param soup: bs4-parsed series page (used only to scrape the
        category tags, which the API metadata omits).
    :return: list of release messages (possibly empty).
    """
    title = metadata['title']
    author = metadata['user']['name']
    desc = metadata['description']
    tags = metadata['tags']

    # Apparently the description is rendered in a <pre> tag. Huh?
    # Render the markdown source to HTML ourselves.
    desc = markdown.markdown(desc, extensions=["linkify"])

    # Siiiiiigh. Really? Strip contest spam from titles.
    title = title.strip()
    for junk in ("[#wattys2015]", "(Wattys2015) ", "#Wattys2015", "Wattys2015"):
        title = title.replace(junk, "")
    title = title.strip()

    if metadata['numParts'] < 3:
        return []
    if metadata['voteCount'] < 100:
        return []
    # Language ID 1 is english.
    if metadata['language']['id'] != 1:
        return []
    # Allow blocking of item by ID
    if metadata['id'] in BLOCK_IDS:
        return []

    # For some particularly stupid reason, the item category tag is not
    # included in the metadata, so we parse it out from the page manually.
    tagdiv = soup.find("div", class_="tags")
    if tagdiv:
        for tag in tagdiv.find_all("a", class_='tag'):
            tags.append(tag.get_text())

    # NOTE(review): the first .replace(" ", " ") is a no-op as written —
    # possibly a mangled non-breaking-space/double-space normalization.
    # Preserved byte-for-byte; confirm against upstream.
    tags = list(set([item.lower().strip().replace(" ", " ").replace(" ", "-") for item in tags]))

    # Mask any content with any of the blocked tags.
    # (Return [] rather than None so callers can always iterate the
    # result; every other exit path returns a list.)
    if any(item in tags for item in WATTPAD_MASKED_TAGS):
        self.log.warning("Item has a masked tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    # And check that at least one of the target tags is present.
    if not any(item in tags for item in WATTPAD_REQUIRED_TAGS):
        self.log.warning("Item missing required tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    extra = {
        'tags':       tags[:],
        'homepage':   seriesPageUrl,
        'sourcesite': 'WattPad',
    }

    # A commented-out heuristic that validated chapter names by their
    # numeric content used to live in this loop; it was dead code and
    # has been removed. Parts are numbered sequentially from 1.
    retval = []
    for index, release in enumerate(metadata['parts'], start=1):
        chp_title = release['title']
        dt = datetime.datetime.strptime(release['modifyDate'], "%Y-%m-%dT%H:%M:%SZ")
        reldate = calendar.timegm(dt.timetuple())
        raw_item = {
            'srcname':   "WattPad",
            'published': reldate,
            'linkUrl':   release['url'],
        }
        msg = msgpackers.buildReleaseMessage(raw_item, title, None, index, None,
                                             author=author, postfix=chp_title,
                                             tl_type='oel', extraData=extra,
                                             beta=IS_BETA)
        retval.append(msg)

    # Don't send the series metadata if we didn't find any chapters.
    if not retval:
        print("No chapters!")
        return []

    seriesmeta = {
        'title':      title,
        'author':     author,
        'tags':       tags,
        'homepage':   seriesPageUrl,
        'desc':       desc,
        'tl_type':    'oel',
        'sourcesite': 'WattPad',
    }
    pkt = msgpackers.sendSeriesInfoPacket(seriesmeta, beta=IS_BETA)

    self.log.info("Wattpad scraper generated %s amqp messages!", len(retval) + 1)
    self.amqp_put_item(pkt)
    return retval