def parseItem(self, elem):
    """Build a search result entry from one RSS <item> element.

    Raises IndexerResultParsingRowException when a mandatory field
    (title, enclosure, pubDate) is missing or the date cannot be parsed.
    """
    title = elem.find("title")
    url = elem.find("enclosure")
    pubdate = elem.find("pubDate")
    if any(node is None for node in (title, url, pubdate)):
        raise IndexerResultParsingRowException(
            "Unable to find title, url or date")

    entry = self.create_nzb_search_result()

    lowered_title = title.text.lower()
    if "password protect" in lowered_title or "passworded" in lowered_title:
        entry.passworded = True

    # The release name is usually wrapped in double quotes inside the title
    quoted = re.search(r'"(.*)"', title.text)
    entry.title = quoted.group(1) if quoted else title.text

    entry.link = url.attrib["url"]
    entry.size = int(url.attrib["length"])
    entry.indexer = self.name
    entry.category = getUnknownCategory()
    entry.details_link = elem.find("link").text
    # GUID looks like "http://www.nzbclub.com/nzb_view58556415" of which we only want the last part
    entry.indexerguid = elem.find("guid").text[-8:]

    description = urlparse.unquote(elem.find("description").text).replace("+", " ")
    # [x NFO Files] is missing if there is no NFO
    if re.search(r"\d NFO Files", description):
        entry.has_nfo = NzbSearchResult.HAS_NFO_YES
    else:
        entry.has_nfo = NzbSearchResult.HAS_NFO_NO

    group_match = self.group_pattern.search(description)
    if group_match:
        entry.group = group_match.group(1).strip()
    poster_match = self.poster_pattern.search(description)
    if poster_match:
        entry.poster = poster_match.group(1).strip()

    try:
        # NOTE: pubdate is rebound from the XML element to an Arrow instance
        pubdate = arrow.get(pubdate.text, 'ddd, DD MMM YYYY HH:mm:ss Z')
        entry.epoch = pubdate.timestamp
        entry.pubdate_utc = str(pubdate)
        entry.age_days = (arrow.utcnow() - pubdate).days
        entry.pubDate = pubdate.format("ddd, DD MMM YYYY HH:mm:ss Z")
    except Exception:
        self.error("Unable to parse pubdate %s" % pubdate.text)
        raise IndexerResultParsingRowException("Unable to parse date")
    return entry
def parse_date(self, agetd, entry):
    """Read the age cell and fill the entry's date/age fields in place.

    Raises IndexerResultParsingRowException when no age info is found.
    """
    days = None
    hours = None
    match = self.age_pattern.search(agetd.text)
    if match:
        days = int(match.group("days1"))
        # second capture is a fractional-day digit (tenths): 1 tenth = 2.4 hours
        hours = int(match.group("days2")) * 2.4
    else:
        match = re.search(r"(?P<hours>\d+) hours?", agetd.text)
        if match:
            days = 0
            hours = int(match.group("hours"))

    if hours is None:
        self.error("Found no age info in %s" % str(agetd))
        raise IndexerResultParsingRowException("Unable to parse age")

    # hours because of timezone change below
    pubdate = arrow.utcnow().replace(days=-days, hours=-1)
    if hours > 0:
        pubdate = pubdate.replace(hours=-hours)
    pubdate = pubdate.to("+01:00")  # nzbindex server time, I guess?
    entry.epoch = pubdate.timestamp
    entry.pubdate_utc = str(pubdate)
    entry.age_days = (arrow.utcnow() - pubdate).days
    entry.age = str(entry.age_days) + "d"
    # Precise to 2.4 hours, should be enough for duplicate detection
    entry.age_precise = True
    entry.pubDate = pubdate.format("ddd, DD MMM YYYY HH:mm:ss Z")
def parseRow(self, row):
    """Build a search result entry from one HTML result table row.

    Raises IndexerResultParsingRowException for advertisement rows
    (they don't have the expected five cells).
    """
    cells = list(row.find_all("td"))
    if len(cells) != 5:
        # advertisement
        raise IndexerResultParsingRowException("Ad")

    entry = self.create_nzb_search_result()
    entry.indexerguid = row.find("input")["value"]

    infotd = cells[1]
    if "password protected" in infotd.text.lower():
        entry.passworded = True

    # Collapse the label text to a single line with single spaces
    raw_title = re.sub(" +", " ", infotd.find("label").text.replace("\n", ""))
    title_match = self.title_pattern.search(raw_title)
    entry.title = title_match.group(1) if title_match else raw_title
    entry.title = self.cleanUpTitle(entry.title)

    # 1 nfo file is missing if there is no NFO
    info = infotd.find("div", class_="fileinfo")
    if info is not None and re.search(r"\d NFO", info.text):
        entry.has_nfo = NzbSearchResult.HAS_NFO_YES
    else:
        entry.has_nfo = NzbSearchResult.HAS_NFO_NO

    entry.poster = self.parse_poster(infotd)

    download_links = infotd.findAll('a', text=re.compile('Download'))
    if download_links is not None and len(download_links) == 1:
        entry.link = download_links[0]["href"]
    else:
        self.debug("Did not find link in row")

    complete = infotd.find("span", class_="complete")
    if complete:
        # keep only the leading token (e.g. the file count before " files")
        entry.files = complete.text[0:complete.text.find(" ")]

    entry.category = getUnknownCategory()
    entry.size = self.parse_size(cells[2])
    entry.group = cells[3].text.replace("\n", "").replace("a.b.", "alt.binaries.").strip()
    self.parse_date(cells[4], entry)

    collection_links = infotd.findAll("a", href=True, text="View collection")
    if collection_links is not None and len(collection_links) > 0:
        entry.details_link = collection_links[0].attrs["href"]
    return entry
def parseRow(self, row):
    """Build a search result entry from one binsearch-style HTML row.

    Raises IndexerResultParsingRowException when the row has no title
    or no info span, or when the age cannot be parsed.
    """
    entry = self.create_nzb_search_result()

    title_node = row.find('span', attrs={'class': 's'})
    if title_node is None:
        self.debug("Ignored entry because it has no title")
        raise IndexerResultParsingRowException("No title found")
    title = title_node.text

    lowered = title.lower()
    if "password protect" in lowered or "passworded" in lowered:
        entry.passworded = True

    title_match = self.title_pattern.search(title)
    entry.title = title_match.group(1) if title_match else title

    entry.indexerguid = row.find("input", attrs={"type": "checkbox"})["name"]
    entry.link = self.get_nzb_link(entry.indexerguid, None)

    info = row.find("span", attrs={"class": "d"})
    if info is None:
        self.debug("Ignored entry because it has no info")
        raise IndexerResultParsingRowException("No info found")

    # e.g. '/?b=MARVELS.AVENGERS...&g=alt.binaries.movies.mkv&p=Ramer%40marmer.com+%28Clown_nez%29&max=250'
    collection_link = info.find("a")["href"]
    entry.details_link = "%s%s" % (self.host, collection_link)

    # NOTE(review): attribute is spelled "goup_pattern" where it is defined
    # elsewhere in this class — keep that spelling here
    group_match = self.goup_pattern.search(collection_link)
    if group_match:
        entry.group = group_match.group(1).strip()
    poster_match = self.poster_pattern.search(collection_link)
    if poster_match:
        entry.poster = urlparse.unquote(poster_match.group(1).strip()).replace("+", " ")

    # Size
    size_match = self.size_pattern.search(info.text)
    if not size_match:
        self.debug("Unable to find size information in %s" % info.text)
    else:
        size = float(size_match.group("size"))
        # unknown units fall through with multiplier 1 (same as original chain)
        size *= {"KB": 1 << 10, "MB": 1 << 20, "GB": 1 << 30}.get(size_match.group("unit"), 1)
        entry.size = int(size)

    entry.category = getUnknownCategory()

    # 1 nfo file is missing if there is no NFO
    if self.nfo_pattern.search(info.text):
        entry.has_nfo = NzbSearchResult.HAS_NFO_YES
    else:
        entry.has_nfo = NzbSearchResult.HAS_NFO_NO

    # Age
    try:
        pubdate = re.search(r"(\d{1,2}\-\w{3}\-\d{4})", row.text).group(1)
        pubdate = arrow.get(pubdate, "DD-MMM-YYYY")
        entry.epoch = pubdate.timestamp
        entry.pubdate_utc = str(pubdate)
        entry.age_days = (arrow.utcnow() - pubdate).days
        entry.age_precise = False  # only the calendar day is known
        entry.pubDate = pubdate.format("ddd, DD MMM YYYY HH:mm:ss Z")
    except Exception as e:
        self.error("Unable to find age in %s" % row.find_all("td")[-1:][0].text)
        raise IndexerResultParsingRowException("Unable to parse age")
    return entry
def process_query_result(self, xml_response, searchRequest, maxResults=None):
    """Parse an omgwtf response, which is either their paged XML search
    format (root tag "xml") or an RSS feed (root tag "rss").

    Returns an IndexerProcessingResult with the accepted entries and
    counters. Raises IndexerResultParsingException when the response is
    not valid XML, IndexerResultParsingRowException when an RSS item
    lacks a GUID or usenet group.
    """
    self.debug("Started processing results")
    if "0 results found" in xml_response:
        return IndexerProcessingResult(entries=[], queries=[], total=0, total_known=True, has_more=False, rejected=0)
    if "search to short" in xml_response:
        self.info("omgwtf says the query was too short")
        return IndexerProcessingResult(entries=[], queries=[], total=0, total_known=True, has_more=False, rejected=0)
    entries = []
    countRejected = 0
    try:
        tree = ET.fromstring(xml_response)
    except Exception:
        self.exception("Error parsing XML: %s..." % xml_response[:500])
        raise IndexerResultParsingException("Error parsing XML", self)
    if tree.tag == "xml":
        total = int(tree.find("info").find("results").text)
        current_page = int(tree.find("info").find("current_page").text)
        total_pages = int(tree.find("info").find("pages").text)
        has_more = current_page < total_pages
        for item in tree.find("search_req").findall("post"):
            entry = self.parseItem(item)
            accepted, reason = self.accept_result(entry, searchRequest, self.supportedFilters)
            if accepted:
                entries.append(entry)
            else:
                countRejected += 1
                self.debug("Rejected search result. Reason: %s" % reason)
        return IndexerProcessingResult(entries=entries, queries=[], total=total, total_known=True, has_more=has_more, rejected=countRejected)
    elif tree.tag == "rss":
        for item in tree.find("channel").findall("item"):
            entry = self.create_nzb_search_result()
            indexerguid = item.find("guid").text
            m = self.regexGuid.match(indexerguid)
            if m:
                entry.indexerguid = m.group(1)
            else:
                self.warn("Unable to find GUID in " + indexerguid)
                raise IndexerResultParsingRowException("Unable to find GUID")
            entry.title = item.find("title").text
            description = item.find("description").text
            m = self.regexGroup.match(description)
            if m:
                entry.group = m.group(1)
            else:
                self.warn("Unable to find group in " + description)
                raise IndexerResultParsingRowException("Unable to find usenet group")
            entry.size = long(item.find("enclosure").attrib["length"])
            entry.pubDate = item.find("pubDate").text
            pubdate = arrow.get(entry.pubDate, 'ddd, DD MMM YYYY HH:mm:ss Z')
            entry.epoch = pubdate.timestamp
            entry.pubdate_utc = str(pubdate)
            entry.age_days = (arrow.utcnow() - pubdate).days
            # Fix: was "entry.precise_date", but the attribute used by every
            # other parser in this module (see parse_date/parseRow) is
            # "age_precise" — setting the wrong name left the flag unset.
            entry.age_precise = True
            entry.link = item.find("link").text
            entry.has_nfo = NzbSearchResult.HAS_NFO_MAYBE
            categoryid = item.find("categoryid").text
            entry.details_link = self.get_details_link(entry.indexerguid)
            if categoryid in omgwtf_to_categories:
                entry.category = getCategoryByName(omgwtf_to_categories[categoryid])
            else:
                entry.category = getUnknownCategory()
            accepted, reason = self.accept_result(entry, searchRequest, self.supportedFilters)
            if accepted:
                entries.append(entry)
            else:
                countRejected += 1
                self.debug("Rejected search result. Reason: %s" % reason)
        return IndexerProcessingResult(entries=entries, queries=[], total=len(entries), total_known=True, has_more=False, rejected=countRejected)
    else:
        self.warn("Unknown response type: %s" % xml_response[:100])
        return IndexerProcessingResult(entries=[], queries=[], total=0, total_known=True, has_more=False, rejected=countRejected)