def cache_returned_values(*args, **kwargs):
    """Cache-aware wrapper around a request function.

    Hashes the call arguments, looks the hash up in the 'cached_requests'
    table of the FeedCrawler.db found among the arguments, and either
    returns the cached (pickled) result or executes the wrapped function
    and stores its result.

    NOTE(review): this is the inner wrapper of a decorator — `func` is taken
    from the enclosing (not visible here) decorator scope; confirm against
    the surrounding definition.

    Raises:
        DbFileMissingExpection: when no FeedCrawler.db path is among *args.
    """
    to_hash = ""
    dbfile = False
    for a in args:
        # The path to the db file which we will use for caching is always one of the arguments
        if isinstance(a, str) and "FeedCrawler.db" in a:
            dbfile = a
        # Ignore the scraper object when caching
        if not isinstance(a, cloudscraper.CloudScraper):
            # convert all arguments to hashable strings
            to_hash += codecs.encode(pickle.dumps(a), "base64").decode()
    # This hash is based on all arguments of the request
    hashed = hashlib.sha256(to_hash.encode('ascii', 'ignore')).hexdigest()
    if dbfile:
        # Check if there is a cached request for this hash
        cached = FeedDb(dbfile, 'cached_requests').retrieve(hashed)
        if cached:
            # Unpack and return the cached result instead of processing the request
            return pickle.loads(codecs.decode(cached.encode(), "base64"))
        else:
            # Cache miss: run the wrapped function and store its pickled result
            value = func(*args, **kwargs)
            FeedDb(dbfile, 'cached_requests').store(
                hashed,
                codecs.encode(pickle.dumps(value), "base64").decode())
            return value
    raise DbFileMissingExpection(str(args[0]))
def __init__(self, configfile, dbfile, device, logging, scraper, filename):
    """Set up a shows crawler for the SJ site.

    Loads config sections, hoster settings and change-detection (cdc) state,
    then builds the title regex pattern from the configured series list.

    NOTE(review): relies on `self._INTERNAL_NAME` being defined as a class
    attribute on the enclosing class (not visible here).
    """
    self.configfile = configfile
    self.dbfile = dbfile
    self.device = device
    self.hostnames = CrawlerConfig('Hostnames', self.configfile)
    self.url = self.hostnames.get('sj')
    self.filename = filename
    # Season lists use the ContentAll section; everything else uses ContentShows
    if "List_ContentAll_Seasons" in self.filename:
        self.config = CrawlerConfig("ContentAll", self.configfile)
    else:
        self.config = CrawlerConfig("ContentShows", self.configfile)
    self.feedcrawler = CrawlerConfig("FeedCrawler", self.configfile)
    self.hevc_retail = self.config.get("hevc_retail")
    self.retail_only = self.config.get("retail_only")
    self.hoster_fallback = self.config.get("hoster_fallback")
    self.hosters = CrawlerConfig("Hosters", configfile).get_section()
    self.log_info = logging.info
    self.log_error = logging.error
    self.log_debug = logging.debug
    self.scraper = scraper
    self.db = FeedDb(self.dbfile, 'FeedCrawler')
    self.quality = self.config.get("quality")
    self.prefer_dw_mirror = self.feedcrawler.get("prefer_dw_mirror")
    # Change-detection cache: remembers last run state per list file
    self.cdc = FeedDb(self.dbfile, 'cdc')
    self.last_set = self.cdc.retrieve(self._INTERNAL_NAME + "Set-" + self.filename)
    self.last_sha = self.cdc.retrieve(self._INTERNAL_NAME + "-" + self.filename)
    self.headers = {'If-Modified-Since': str(
        self.cdc.retrieve(self._INTERNAL_NAME + "Headers-" + self.filename))}
    # Settings snapshot used to detect configuration changes between runs
    self.settings_array = ["quality", "rejectlist", "regex",
                           "hevc_retail", "retail_only", "hoster_fallback"]
    self.settings = []
    self.settings.append(self.feedcrawler.get("english"))
    self.settings.append(self.feedcrawler.get("surround"))
    self.settings.append(self.feedcrawler.get("prefer_dw_mirror"))
    self.settings.append(self.hosters)
    for s in self.settings_array:
        self.settings.append(self.config.get(s))
    self.mediatype = "Serien"
    self.listtype = ""
    self.empty_list = False
    # Human-readable list-type suffix used in log output
    if self.filename == 'List_ContentShows_Seasons_Regex':
        self.listtype = " (Staffeln/RegEx)"
    elif self.filename == 'List_ContentAll_Seasons':
        self.seasonssource = self.config.get('seasonssource').lower()
        self.listtype = " (Staffeln)"
    elif self.filename == 'List_ContentShows_Shows_Regex':
        self.listtype = " (RegEx)"
    # Build the case-insensitive title match pattern from the series list
    list_content = shared_shows.get_series_list(self)
    if list_content:
        self.pattern = r'^(' + "|".join(list_content).lower() + ')'
    else:
        self.empty_list = True
    self.day = 0
    self.get_feed_method = j_releases_to_feedparser_dict
    self.parse_download_method = j_parse_download
def remove_decrypt(title, dbfile):
    """Delete the entry matching *title* from the 'to_decrypt' table.

    Args:
        title: Release title to remove (compared with surrounding whitespace
            stripped on both sides).
        dbfile: Path to the FeedCrawler database file.

    Returns:
        True when a matching row was deleted, False otherwise (including on
        any database error — deletion is best-effort and must never crash
        the caller).
    """
    try:
        # Open the table once instead of once per lookup plus once per delete
        db = FeedDb(dbfile, 'to_decrypt')
        wanted = title.strip()
        for row in db.retrieve_all_titles():
            if row[0].strip() == wanted:
                db.delete(row[0])
                return True
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        pass
    return False
def __init__(self, configfile, dbfile, device, logging, scraper):
    """Store crawler dependencies and open the CustomDD config section.

    Args:
        configfile: Path to FeedCrawler.ini.
        dbfile: Path to FeedCrawler.db.
        device: My JDownloader device handle (or False).
        logging: Configured logging module/object providing info/error/debug.
        scraper: cloudscraper session used for HTTP requests.
    """
    self.configfile = configfile
    self.dbfile = dbfile
    self.device = device
    self.config = CrawlerConfig("CustomDD", self.configfile)
    # Bind log methods directly so call sites stay short
    self.log_info = logging.info
    self.log_error = logging.error
    self.log_debug = logging.debug
    self.scraper = scraper
    self.db = FeedDb(self.dbfile, 'FeedCrawler')
def myjd_download(configfile, dbfile, device, title, subdir, links, password):
    """Send a download to My JDownloader, merging episode packages if possible.

    If *title* is a single episode and the same links previously failed as
    part of another episode package, the old package is renamed to span both
    episodes (e.g. E01 + E02 -> E01-E02) instead of adding a duplicate.
    Otherwise the title is simply handed to download().

    Returns the (possibly updated) device on success, False otherwise.
    """
    if device:
        # Capture the episode token (e.g. "E05") from an SxxExx title
        is_episode = re.findall(r'[\w.\s]*S\d{1,2}(E\d{1,2})[\w.\s]*', title)
        if is_episode:
            exists = check_failed_link_exists(links, configfile, device)
            if exists:
                broken_title = False
                # exists tuple layout per check_failed_link_exists:
                # (device, linkids, package_id, old_title, old_path)
                device = exists[0]
                old_title = exists[3]
                old_path = exists[4]
                try:
                    new_episode = is_episode.pop()
                except:
                    broken_title = True
                try:
                    # Extract the episode (or episode range) of the existing package
                    old_episode = re.findall(
                        r'[\w.\s]*(?!S\d{1,2})((?:E\d{1,2}-E\d{1,2})|(?:E\d{1,2}E\d{1,2})|(?:E\d{1,2}-\d{1,2})|(?:E\d{1,2}))[\w.\s]*',
                        old_title).pop()
                    combined_episodes = new_episode + '-' + old_episode
                except:
                    broken_title = True
                if not broken_title:
                    linkids = exists[1]
                    package_id = [exists[2]]
                    # Rename old package to cover the combined episode range
                    new_title = title.replace(new_episode, combined_episodes)
                    new_path = old_path.replace(old_title, new_title)
                    device = move_to_new_package(configfile, device, linkids,
                                                 package_id, new_title, new_path)
                    # Keep the crawldog watch table in sync with the rename
                    FeedDb(dbfile, 'crawldog').store(new_title, 'added')
                    FeedDb(dbfile, 'crawldog').delete(old_title)
                    return device
        # No merge possible/needed: hand off as a regular download
        device = download(configfile, dbfile, device, title, subdir, links, password)
        if device:
            return device
    return False
def get_to_decrypt(dbfile):
    """Read all pending entries from the 'to_decrypt' table.

    Rows are stored by add_decrypt() as title -> "<url>|<password>".

    Args:
        dbfile: Path to the FeedCrawler database file.

    Returns:
        A list of dicts with keys 'name', 'url' and 'password', or False
        when the table is empty or cannot be read.
    """
    try:
        rows = FeedDb(dbfile, 'to_decrypt').retrieve_all_titles()
    except Exception:
        # Narrowed from a bare except: a broken db is treated as "nothing pending"
        return False
    if not rows:
        return False
    packages = []
    for row in rows:
        title = row[0]
        # partition() keeps a row with a missing '|' from aborting the whole
        # list (the old split()[1] raised and dropped every package); the
        # password is simply empty in that case.
        url, _, password = row[1].partition('|')
        packages.append({
            'name': title,
            'url': url,
            'password': password
        })
    return packages
def add_decrypt(title, link, password, dbfile):
    """Queue a release for later decryption.

    Stores title -> "<link>|<password>" in the 'to_decrypt' table (the format
    get_to_decrypt() expects).

    Returns:
        True on success, False on any database error (best-effort).
    """
    try:
        FeedDb(dbfile, 'to_decrypt').store(title, link + '|' + password)
        return True
    except Exception:
        # Narrowed from a bare except: no longer swallows KeyboardInterrupt
        return False
def download(payload, configfile, dbfile):
    """Web-search download for series releases on SJ.

    Decodes the base64 payload ("href|title|special"), fetches the series'
    release list from the SJ API, picks the best season/episode releases by
    quality, language and hoster availability, queues them for decryption and
    notifies the user.

    Returns the list of matched release names, or False when nothing matched.
    """
    hostnames = CrawlerConfig('Hostnames', configfile)
    sj = hostnames.get('sj')

    # payload format: "<href>|<title>|<special>" ("None" means no special tag)
    payload = decode_base64(payload).split("|")
    href = payload[0]
    title = payload[1]
    special = payload[2].strip().replace("None", "")

    series_url = 'https://' + sj + href
    series_info = get_url(series_url, configfile, dbfile)
    series_id = re.findall(r'data-mediaid="(.*?)"', series_info)[0]

    api_url = 'https://' + sj + '/api/media/' + series_id + '/releases'
    releases = get_url(api_url, configfile, dbfile)
    unsorted_seasons = json.loads(releases)

    # Make sure the title is present on both watch lists
    listen = ["List_ContentShows_Shows", "List_ContentAll_Seasons"]
    for liste in listen:
        cont = ListDb(dbfile, liste).retrieve()
        list_title = sanitize(title)
        if not cont:
            cont = ""
        if list_title not in cont:
            ListDb(dbfile, liste).store(list_title)

    config = CrawlerConfig('ContentShows', configfile)
    english_ok = CrawlerConfig('FeedCrawler', configfile).get("english")
    quality = config.get('quality')
    ignore = config.get('rejectlist')

    result_seasons = {}
    result_episodes = {}

    # Re-order so "sp"(ecial) seasons are processed first, regular ones after
    seasons = {}
    for season in unsorted_seasons:
        if "sp" in season.lower():
            seasons[season] = unsorted_seasons[season]
    for season in unsorted_seasons:
        if "sp" not in season.lower():
            seasons[season] = unsorted_seasons[season]

    for season in seasons:
        releases = seasons[season]
        for release in releases['items']:
            name = release['name'].encode('ascii', errors='ignore').decode('utf-8')
            try:
                # Prefer the season tag embedded in the release name, if any
                season = re.findall(r'.*\.(s\d{1,3}).*', name, re.IGNORECASE)[0]
            except:
                pass
            hosters = release['hoster']
            try:
                valid = bool(release['resolution'] == quality)
            except:
                # No resolution field: fall back to matching the name
                valid = re.match(re.compile(r'.*' + quality + r'.*'), name)
            if valid and special:
                valid = bool("." + special.lower() + "." in name.lower())
            if valid and not english_ok:
                valid = bool(".german." in name.lower())
            if valid:
                # Require at least one allowed hoster (or the fallback option)
                valid = False
                for hoster in hosters:
                    if hoster and check_hoster(
                            hoster, configfile) or config.get("hoster_fallback"):
                        valid = True
            if valid:
                try:
                    ep = release['episode']
                    if ep:
                        # Episode release: keep only the best-rated name per episode
                        existing = result_episodes.get(season)
                        if existing:
                            valid = False
                            for e in existing:
                                if e == ep:
                                    if rate(name, ignore) > rate(existing[e], ignore):
                                        valid = True
                                else:
                                    valid = True
                            if valid:
                                existing.update({ep: name})
                        else:
                            existing = {ep: name}
                        result_episodes.update({season: existing})
                        continue
                except:
                    pass
                # Season release: keep only the best-rated name per season
                existing = result_seasons.get(season)
                dont = False
                if existing:
                    if rate(name, ignore) < rate(existing, ignore):
                        dont = True
                if not dont:
                    result_seasons.update({season: name})

        # A full-season result supersedes single episodes of the same season
        try:
            if result_seasons[season] and result_episodes[season]:
                del result_episodes[season]
        except:
            pass

        success = False
        try:
            if result_seasons[season]:
                success = True
        except:
            try:
                if result_episodes[season]:
                    success = True
            except:
                pass
        if success:
            logger.debug(u"Websuche erfolgreich für " + title + " - " + season)
        else:
            # Nothing matched with the quality filter: retry without it
            for release in releases['items']:
                name = release['name'].encode('ascii', errors='ignore').decode('utf-8')
                hosters = release['hoster']
                valid = True
                if valid and special:
                    valid = bool("." + special.lower() + "." in name.lower())
                if valid and not english_ok:
                    valid = bool(".german." in name.lower())
                if valid:
                    valid = False
                    for hoster in hosters:
                        if hoster and check_hoster(
                                hoster, configfile) or config.get("hoster_fallback"):
                            valid = True
                if valid:
                    try:
                        ep = release['episode']
                        if ep:
                            existing = result_episodes.get(season)
                            if existing:
                                for e in existing:
                                    if e == ep:
                                        if rate(name, ignore) > rate(existing[e], ignore):
                                            existing.update({ep: name})
                            else:
                                existing = {ep: name}
                            result_episodes.update({season: existing})
                            continue
                    except:
                        pass
                    existing = result_seasons.get(season)
                    dont = False
                    if existing:
                        if rate(name, ignore) < rate(existing, ignore):
                            dont = True
                    if not dont:
                        result_seasons.update({season: name})
            try:
                if result_seasons[season] and result_episodes[season]:
                    del result_episodes[season]
            except:
                pass
            logger.debug(u"Websuche erfolgreich für " + title + " - " + season)

    # Flatten season and episode winners into one result list
    matches = []
    for season in result_seasons:
        matches.append(result_seasons[season])
    for season in result_episodes:
        for episode in result_episodes[season]:
            matches.append(result_episodes[season][episode])

    notify_array = []
    for title in matches:
        db = FeedDb(dbfile, 'FeedCrawler')
        # Queue for decryption; only log/notify when queueing succeeded
        if add_decrypt(title, series_url, sj, dbfile):
            db.store(title, 'added')
            log_entry = u'[Suche/Serie] - ' + title + ' - [SJ]'
            logger.info(log_entry)
            notify_array.append(log_entry)

    notify(notify_array, configfile)

    if not matches:
        return False
    return matches
def download(payload, device, configfile, dbfile):
    """Web-search download for movie/season releases (BY, NK, FX, DW sites).

    Decodes the base64 payload, scrapes the site-specific page for hoster
    links, filters them by allowed hosters, and hands the result to the
    appropriate download method (direct My JDownloader download, or the
    decryption queue for DW).

    Returns True for a queued season, [key] for a queued movie, False on
    failure.
    """
    config = CrawlerConfig('ContentAll', configfile)
    db = FeedDb(dbfile, 'FeedCrawler')
    hostnames = CrawlerConfig('Hostnames', configfile)
    by = hostnames.get('by')
    nk = hostnames.get('nk')

    # payload format: "<link>|<password>" (DW/FX: "<link>|<key>|<password>")
    payload = decode_base64(payload).split("|")
    link = payload[0]
    password = payload[1]

    site = check_is_site(link, configfile)
    if not site:
        return False
    elif "DW" in site:
        # DW links are queued for decryption instead of downloaded directly
        download_method = add_decrypt_instead_of_download
        download_links = [link]
        key = payload[1]
        password = payload[2]
    else:
        url = get_url(link, configfile, dbfile)
        if not url or "NinjaFirewall 429" in url:
            return False
        download_method = myjd_download
        soup = BeautifulSoup(url, 'lxml')

        if "BY" in site:
            key = soup.find("small").text
            links = soup.find_all("iframe")
            # Collect BY-hosted iframes and resolve them concurrently
            async_link_results = []
            for link in links:
                link = link["src"]
                if 'https://' + by in link:
                    async_link_results.append(link)
            async_link_results = get_urls_async(async_link_results, configfile,
                                                dbfile)
            links = async_link_results[0]
            url_hosters = []
            for link in links:
                if link:
                    link = BeautifulSoup(link, 'lxml').find(
                        "a", href=re.compile("/go\.php\?"))
                    if link:
                        url_hosters.append(
                            [link["href"], link.text.replace(" ", "")])
        elif "NK" in site:
            key = soup.find("span", {"class": "subtitle"}).text
            url_hosters = []
            hosters = soup.find_all("a", href=re.compile("/go/"))
            for hoster in hosters:
                url_hosters.append(
                    ['https://' + nk + hoster["href"], hoster.text])
        elif "FX" in site:
            key = payload[1]
            password = payload[2]
        else:
            return False

        links = {}
        if "FX" in site:
            # fx_get_download_links expects an object carrying .configfile
            class FX:
                configfile = ""
            FX.configfile = configfile
            download_links = fx_get_download_links(FX, url, key)
        else:
            # Iterate in reverse so earlier (preferred) hosters win the dict slot
            for url_hoster in reversed(url_hosters):
                try:
                    link_hoster = url_hoster[1].lower().replace(
                        'target="_blank">', '').replace(" ", "-").replace(
                            "ddownload", "ddl")
                    if check_hoster(link_hoster, configfile):
                        link = url_hoster[0]
                        if by in link:
                            # BY links are masked; resolve the real target
                            demasked_link = get_redirected_url(
                                link, configfile, dbfile, False)
                            if demasked_link:
                                link = demasked_link
                        links[link_hoster] = link
                except:
                    pass
            # No allowed hoster found: optionally accept any hoster as fallback
            if config.get("hoster_fallback") and not links:
                for url_hoster in reversed(url_hosters):
                    link_hoster = url_hoster[1].lower().replace(
                        'target="_blank">', '').replace(" ", "-").replace(
                            "ddownload", "ddl")
                    link = url_hoster[0]
                    if by in link:
                        demasked_link = get_redirected_url(
                            link, configfile, dbfile, False)
                        if demasked_link:
                            link = demasked_link
                    links[link_hoster] = link
            download_links = list(links.values())

    # Strip English markers from the title but remember the language
    englisch = False
    if "*englisch" in key.lower() or "*english" in key.lower():
        key = key.replace('*ENGLISCH', '').replace("*Englisch", "").replace(
            "*ENGLISH", "").replace("*English", "").replace("*", "")
        englisch = True

    # Season pack detection (e.g. "S01.", "S01-S03")
    staffel = re.search(r"s\d{1,2}(-s\d{1,2}|-\d{1,2}|\.)", key.lower())

    if download_links:
        if staffel:
            if download_method(configfile, dbfile, device, key, "FeedCrawler",
                               download_links, password):
                db.store(
                    key.replace(".COMPLETE", "").replace(".Complete", ""),
                    'notdl' if config.get('enforcedl')
                    and '.dl.' not in key.lower() else 'added')
                log_entry = '[Suche/Staffel] - ' + key.replace(
                    ".COMPLETE", "").replace(".Complete",
                                             "") + ' - [' + site + ']'
                logger.info(log_entry)
                notify([log_entry], configfile)
                return True
        else:
            retail = False
            if config.get('cutoff') and '.COMPLETE.' not in key.lower():
                if is_retail(key, dbfile):
                    retail = True
            if download_method(configfile, dbfile, device, key, "FeedCrawler",
                               download_links, password):
                db.store(
                    key, 'notdl' if config.get('enforcedl')
                    and '.dl.' not in key.lower() else 'added')
                log_entry = '[Suche/Film' + (
                    '/Englisch' if englisch and not retail else '') + (
                        '/Englisch/Retail' if englisch and retail else '') + (
                            '/Retail' if not englisch and retail else
                            '') + '] - ' + key + ' - [' + site + ']'
                logger.info(log_entry)
                notify([log_entry], configfile)
                return [key]
    else:
        return False
def main():
    """FeedCrawler entry point.

    Parses CLI arguments, migrates legacy RSScrawler files, validates the
    configured hostnames, connects to My JDownloader, and starts the web
    server, crawler and crawldog processes.
    """
    arguments = docopt(__doc__, version='FeedCrawler')

    print(u"┌──────────────────────────────────────────────┐")
    print(u" FeedCrawler " + version + " von RiX")
    print(u" https://github.com/rix1337/FeedCrawler")
    print(u"└──────────────────────────────────────────────┘")

    if arguments['--docker']:
        configpath = "/config"
    else:
        configpath = files.config(arguments['--config'])
    configfile = os.path.join(configpath, "FeedCrawler.ini")
    dbfile = os.path.join(configpath, "FeedCrawler.db")

    # ToDo Remove this migration from RSScrawler to Feedcrawler in next major version
    if os.path.exists("RSScrawler.conf"):
        os.remove("RSScrawler.conf")

    # ToDo Remove this migration from RSScrawler to Feedcrawler in next major version
    if os.path.exists(os.path.join(configpath, "RSScrawler.log")):
        os.rename(os.path.join(configpath, "RSScrawler.log"),
                  os.path.join(configpath, "FeedCrawler.log"))
        print(u"Migration des RSScrawler-Logs erfolgreich!")

    # ToDo Remove this migration from RSScrawler to Feedcrawler in next major version
    if os.path.exists(os.path.join(configpath, "RSScrawler.ini")):
        # Rewrite legacy section names to the new ones
        with open(os.path.join(configpath, "RSScrawler.ini"), 'r') as file:
            filedata = file.read()
        filedata = filedata.replace("[RSScrawler]", "[FeedCrawler]")
        filedata = filedata.replace("[MB]", "[ContentAll]")
        filedata = filedata.replace("[SJ]", "[ContentShows]")
        filedata = filedata.replace("[DJ]", "[CustomDJ]")
        filedata = filedata.replace("[DD]", "[CustomDD]")
        with open(os.path.join(configpath, "FeedCrawler.ini"), 'w') as file:
            file.write(filedata)
        os.remove(os.path.join(configpath, "RSScrawler.ini"))
        print(u"Migration der RSScrawler-Einstellungen erfolgreich!")

    # ToDo Remove this migration from RSScrawler to Feedcrawler in next major version
    if os.path.exists(os.path.join(configpath, "RSScrawler.db")):
        os.rename(os.path.join(configpath, "RSScrawler.db"),
                  os.path.join(configpath, "FeedCrawler.db"))
        FeedDb(dbfile, 'rsscrawler').rename_table('FeedCrawler')
        FeedDb(dbfile, 'MB_Filme').rename_table('List_ContentAll_Movies')
        FeedDb(dbfile, 'MB_Regex').rename_table('List_ContentAll_Movies_Regex')
        FeedDb(dbfile, 'MB_Staffeln').rename_table('List_ContentAll_Seasons')
        FeedDb(dbfile, 'SJ_Serien').rename_table('List_ContentShows_Shows')
        FeedDb(dbfile,
               'SJ_Serien_Regex').rename_table('List_ContentShows_Shows_Regex')
        FeedDb(dbfile, 'SJ_Staffeln_Regex').rename_table(
            'List_ContentShows_Seasons_Regex')
        FeedDb(dbfile, 'DJ_Dokus').rename_table('List_CustomDJ_Documentaries')
        FeedDb(
            dbfile,
            'DJ_Dokus_Regex').rename_table('List_CustomDJ_Documentaries_Regex')
        print(u"Migration der RSScrawler-Datenbank erfolgreich!")

    print(u"Nutze das Verzeichnis " + configpath + u" für Einstellungen/Logs")

    log_level = logging.__dict__[arguments['--log-level']] if arguments[
        '--log-level'] in logging.__dict__ else logging.INFO
    log_file = os.path.join(configpath, 'FeedCrawler.log')
    log_format = '%(asctime)s - %(message)s'

    hostnames = CrawlerConfig('Hostnames', configfile)

    def clean_up_hostname(host, string):
        # Strip scheme/path, lowercase, persist the cleaned value, print status
        if '/' in string:
            string = string.replace('https://', '').replace('http://', '')
            string = re.findall(r'([a-z-.]*\.[a-z]*)', string)[0]
            hostnames.save(host, string)
        if re.match(r'.*[A-Z].*', string):
            hostnames.save(host, string.lower())
        if string:
            print(u'Hostname für ' + host.upper() + ": " + string)
        else:
            print(u'Hostname für ' + host.upper() + ': Nicht gesetzt!')
        return string

    set_hostnames = {}
    list_names = ['dw', 'fx', 'sj', 'dj', 'sf', 'ww', 'nk', 'by', 'dd']
    for name in list_names:
        hostname = clean_up_hostname(name, hostnames.get(name))
        if hostname:
            set_hostnames[name] = hostname

    if not arguments['--testlauf'] and not set_hostnames:
        print(
            u'Keine Hostnamen in der FeedCrawler.ini gefunden! Beende FeedCrawler!'
        )
        time.sleep(10)
        sys.exit(1)

    disable_request_warnings(InsecureRequestWarning)

    if arguments['--testlauf']:
        device = False
    else:
        # Resolve the My JDownloader device: prompt, config, or auto-detect
        if not os.path.exists(configfile):
            if arguments['--docker']:
                if arguments['--jd-user'] and arguments['--jd-pass']:
                    device = files.myjd_input(configfile, arguments['--port'],
                                              arguments['--jd-user'],
                                              arguments['--jd-pass'],
                                              arguments['--jd-device'])
                else:
                    device = False
            else:
                device = files.myjd_input(configfile, arguments['--port'],
                                          arguments['--jd-user'],
                                          arguments['--jd-pass'],
                                          arguments['--jd-device'])
        else:
            feedcrawler = CrawlerConfig('FeedCrawler', configfile)
            user = feedcrawler.get('myjd_user')
            password = feedcrawler.get('myjd_pass')
            if user and password:
                device = get_device(configfile)
                if not device:
                    device = get_if_one_device(user, password)
                    if device:
                        print(u"Gerätename " + device +
                              " automatisch ermittelt.")
                        feedcrawler.save('myjd_device', device)
                        device = get_device(configfile)
            else:
                device = files.myjd_input(configfile, arguments['--port'],
                                          arguments['--jd-user'],
                                          arguments['--jd-pass'],
                                          arguments['--jd-device'])

        if not device and not arguments['--testlauf']:
            print(
                u'My JDownloader Zugangsdaten fehlerhaft! Beende FeedCrawler!')
            time.sleep(10)
            sys.exit(1)
        else:
            print(u"Erfolgreich mit My JDownloader verbunden. Gerätename: " +
                  device.name)

    feedcrawler = CrawlerConfig('FeedCrawler', configfile)
    port = int(feedcrawler.get("port"))
    docker = False
    if arguments['--docker']:
        port = int('9090')
        docker = True
    elif arguments['--port']:
        port = int(arguments['--port'])

    if feedcrawler.get("prefix"):
        prefix = '/' + feedcrawler.get("prefix")
    else:
        prefix = ''

    local_address = 'http://' + common.check_ip() + ':' + str(port) + prefix
    if not arguments['--docker']:
        print(u'Der Webserver ist erreichbar unter ' + local_address)

    if arguments['--keep-cdc']:
        print(u"CDC-Tabelle nicht geleert!")
    else:
        FeedDb(dbfile, 'cdc').reset()

    p = multiprocessing.Process(target=web_server,
                                args=(port, local_address, docker, configfile,
                                      dbfile, log_level, log_file, log_format,
                                      device))
    p.start()

    if not arguments['--testlauf']:
        c = multiprocessing.Process(target=crawler,
                                    args=(configfile, dbfile, device,
                                          feedcrawler, log_level, log_file,
                                          log_format))
        c.start()
        w = multiprocessing.Process(target=crawldog, args=(configfile, dbfile))
        w.start()

        print(u'Drücke [Strg] + [C] zum Beenden')

        # BUGFIX: signal handlers are invoked as handler(signum, frame);
        # the previous zero-argument definition raised TypeError on Ctrl+C.
        def signal_handler(signum=None, frame=None):
            print(u'Beende FeedCrawler...')
            p.terminate()
            c.terminate()
            w.terminate()
            sys.exit(0)

        signal.signal(signal.SIGINT, signal_handler)

        try:
            while True:
                signal.pause()
        except AttributeError:
            # signal.pause() is unavailable on Windows: busy-wait instead
            while True:
                time.sleep(1)
    else:
        crawler(configfile, dbfile, device, feedcrawler, log_level, log_file,
                log_format)
        p.terminate()
        sys.exit(0)
class BL:
    """Blog-style crawler for the FX site.

    Holds configuration, change-detection (cdc) state and the site-specific
    feed/URL/download method hooks consumed by shared_blogs.periodical_task.
    """

    _SITE = 'FX'
    # Characters substituted when sanitizing titles
    SUBSTITUTE = r"[&#\s/]"

    def __init__(self, configfile, dbfile, device, logging, scraper, filename):
        self.configfile = configfile
        self.dbfile = dbfile
        self.device = device
        self.hostnames = CrawlerConfig('Hostnames', self.configfile)
        self.url = self.hostnames.get('fx')
        # FX uses its own first-level domain label as the archive password
        self.password = self.url.split('.')[0]
        self.URL = 'https://' + self.url
        self.FEED_URLS = [self.URL]
        self.config = CrawlerConfig("ContentAll", self.configfile)
        self.feedcrawler = CrawlerConfig("FeedCrawler", self.configfile)
        self.log_info = logging.info
        self.log_error = logging.error
        self.log_debug = logging.debug
        self.scraper = scraper
        self.filename = filename
        self.pattern = False
        self.db = FeedDb(self.dbfile, 'FeedCrawler')
        self.hevc_retail = self.config.get("hevc_retail")
        self.retail_only = self.config.get("retail_only")
        self.hosters = CrawlerConfig("Hosters", configfile).get_section()
        self.hoster_fallback = self.config.get("hoster_fallback")
        self.prefer_dw_mirror = self.feedcrawler.get("prefer_dw_mirror")
        # Add paginated feed URLs (/page/2 .. /page/<search>) to the crawl set
        search = int(
            CrawlerConfig("ContentAll", self.configfile).get("search"))
        i = 2
        while i <= search:
            page_url = self.URL + "/page/" + str(i)
            if page_url not in self.FEED_URLS:
                self.FEED_URLS.append(page_url)
            i += 1
        # Change-detection cache: remembers last run state per list file
        self.cdc = FeedDb(self.dbfile, 'cdc')
        self.last_set_all = self.cdc.retrieve("ALLSet-" + self.filename)
        self.headers = {
            'If-Modified-Since':
            str(self.cdc.retrieve(self._SITE + "Headers-" + self.filename))
        }
        self.last_sha = self.cdc.retrieve(self._SITE + "-" + self.filename)
        # Settings snapshot used to detect configuration changes between runs
        settings = [
            "quality", "search", "ignore", "regex", "cutoff", "enforcedl",
            "crawlseasons", "seasonsquality", "seasonpacks", "seasonssource",
            "imdbyear", "imdb", "hevc_retail", "retail_only", "hoster_fallback"
        ]
        self.settings = []
        self.settings.append(self.feedcrawler.get("english"))
        self.settings.append(self.feedcrawler.get("surround"))
        self.settings.append(self.feedcrawler.get("prefer_dw_mirror"))
        self.settings.append(self.hosters)
        for s in settings:
            self.settings.append(self.config.get(s))
        self.search_imdb_done = False
        self.search_regular_done = False
        self.dl_unsatisfied = False
        # Site-specific hooks used by the shared blog crawler
        self.get_feed_method = fx_feed_enricher
        self.get_url_method = get_url
        self.get_url_headers_method = get_url_headers
        self.get_download_links_method = fx_get_download_links
        self.download_method = myjd_download
        try:
            self.imdb = float(self.config.get('imdb'))
        except:
            # No/invalid IMDb threshold configured: disable the filter
            self.imdb = 0.0

    def periodical_task(self):
        """Run one crawl cycle via the shared blog crawler; returns the device."""
        self.device = shared_blogs.periodical_task(self)
        return self.device
def check_url(configfile, dbfile, scraper=False):
    """Probe all configured sites for reachability, via proxy and directly.

    For every hostname a site-specific availability check is performed.
    When a proxy is configured, the proxied check runs first; the direct
    check additionally runs when no proxy is set or when the proxied check
    failed and fallback is enabled. Results are persisted in the
    'proxystatus' and 'normalstatus' tables ("Blocked" entries).

    BUGFIX: the status checks previously read
    ``status_code is not (200 or 304)`` which evaluates to
    ``status_code is not 200`` (and relies on CPython int interning), so
    HTTP 304 responses were wrongly treated as blocked. They now use
    ``status_code not in (200, 304)``. Bare excepts were narrowed to
    ``except Exception`` (network errors still count as blocked).

    Returns the (possibly re-created) cloudscraper session.
    """
    hostnames = CrawlerConfig('Hostnames', configfile)
    sj = hostnames.get('sj')
    dj = hostnames.get('dj')
    sf = hostnames.get('sf')
    by = hostnames.get('by')
    dw = hostnames.get('dw')
    fx = hostnames.get('fx')
    nk = hostnames.get('nk')
    ww = hostnames.get('ww')
    dd = hostnames.get('dd')

    if not scraper:
        scraper = cloudscraper.create_scraper()

    sj_url = 'https://' + sj
    dj_url = 'https://' + dj
    sf_url = 'https://' + sf
    by_url = 'https://' + by
    dw_url = 'https://' + dw
    fx_url = 'https://' + fx
    nk_url = 'https://' + nk
    ww_url = 'https://' + ww
    dd_url = 'https://' + dd

    sj_blocked_proxy = False
    dj_blocked_proxy = False
    sf_blocked_proxy = False
    by_blocked_proxy = False
    dw_blocked_proxy = False
    fx_blocked_proxy = False
    nk_blocked_proxy = False
    ww_blocked_proxy = False
    dd_blocked_proxy = False
    sj_blocked = False
    dj_blocked = False
    sf_blocked = False
    by_blocked = False
    dw_blocked = False
    fx_blocked = False
    nk_blocked = False
    ww_blocked = False
    dd_blocked = False

    # Clear previous status before re-probing
    db = FeedDb(dbfile, 'proxystatus')
    db.delete("SJ")
    db.delete("DJ")
    db.delete("SF")
    db.delete("BY")
    db.delete("DW")
    db.delete("FX")
    db.delete("NK")
    db.delete("WW")
    db.delete("DD")
    db_normal = FeedDb(dbfile, 'normalstatus')
    db_normal.delete("SJ")
    db_normal.delete("DJ")
    db_normal.delete("SF")
    db_normal.delete("BY")
    db_normal.delete("DW")
    db_normal.delete("FX")
    db_normal.delete("NK")
    db_normal.delete("WW")
    db_normal.delete("DD")

    proxy = CrawlerConfig('FeedCrawler', configfile).get('proxy')
    fallback = CrawlerConfig('FeedCrawler', configfile).get('fallback')

    if proxy:
        proxies = {'http': proxy, 'https': proxy}

        # SJ/DJ answer a ban with a redirect to a "block." location
        if not sj:
            db.store("SJ", "Blocked")
        else:
            try:
                if "block." in str(
                        scraper.get(
                            sj_url,
                            proxies=proxies,
                            timeout=30,
                            allow_redirects=False).headers.get("location")):
                    sj_blocked_proxy = True
                else:
                    db.delete("SJ")
            except Exception:
                sj_blocked_proxy = True
            if sj_blocked_proxy:
                print(
                    u"Der Zugriff auf SJ ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("SJ", "Blocked")
                # Reset the session so a poisoned cookie jar doesn't linger
                scraper = cloudscraper.create_scraper()

        if not dj:
            db.store("DJ", "Blocked")
        else:
            try:
                if "block." in str(
                        scraper.get(
                            dj_url,
                            proxies=proxies,
                            timeout=30,
                            allow_redirects=False).headers.get("location")):
                    dj_blocked_proxy = True
                else:
                    db.delete("DJ")
            except Exception:
                dj_blocked_proxy = True
            if dj_blocked_proxy:
                print(
                    u"Der Zugriff auf DJ ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("DJ", "Blocked")
                scraper = cloudscraper.create_scraper()

        # SF: verify today's updates page contains the expected markup
        if not sf:
            db.store("SF", "Blocked")
        else:
            try:
                delta = datetime.datetime.now().strftime("%Y-%m-%d")
                sf_test = scraper.get(sf_url + '/updates/' + delta,
                                      proxies=proxies,
                                      timeout=30,
                                      allow_redirects=False)
                if not sf_test.text or sf_test.status_code not in (
                        200, 304) or '<h3><a href="/' not in sf_test.text:
                    sf_blocked_proxy = True
                else:
                    db.delete("SF")
            except Exception:
                sf_blocked_proxy = True
            if sf_blocked_proxy:
                print(
                    u"Der Zugriff auf SF ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("SF", "Blocked")
                scraper = cloudscraper.create_scraper()

        # BY/FX/NK/DD answer a ban with HTTP 403
        if not by:
            db.store("BY", "Blocked")
        else:
            try:
                if scraper.get(by_url,
                               proxies=proxies,
                               timeout=30,
                               allow_redirects=False).status_code == 403:
                    by_blocked_proxy = True
                else:
                    db.delete("BY")
            except Exception:
                by_blocked_proxy = True
            if by_blocked_proxy:
                print(
                    u"Der Zugriff auf BY ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("BY", "Blocked")
                scraper = cloudscraper.create_scraper()

        # DW: verify the movie category page contains the expected markup
        if not dw:
            db.store("DW", "Blocked")
        else:
            try:
                dw_test = scraper.get(dw_url +
                                      "/downloads/hauptkategorie/movies/",
                                      proxies=proxies,
                                      timeout=30,
                                      allow_redirects=False)
                if not dw_test.text or dw_test.status_code not in (
                        200, 304
                ) or '<a id="first_element" href=' not in dw_test.text:
                    dw_blocked_proxy = True
                else:
                    db.delete("DW")
            except Exception:
                dw_blocked_proxy = True
            if dw_blocked_proxy:
                print(
                    u"Der Zugriff auf DW ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("DW", "Blocked")
                scraper = cloudscraper.create_scraper()

        if not fx:
            db.store("FX", "Blocked")
        else:
            try:
                if scraper.get(fx_url,
                               proxies=proxies,
                               timeout=30,
                               allow_redirects=False).status_code == 403:
                    fx_blocked_proxy = True
                else:
                    db.delete("FX")
            except Exception:
                fx_blocked_proxy = True
            if fx_blocked_proxy:
                print(
                    u"Der Zugriff auf FX ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("FX", "Blocked")
                scraper = cloudscraper.create_scraper()

        if not nk:
            db.store("NK", "Blocked")
        else:
            try:
                if scraper.get(nk_url,
                               proxies=proxies,
                               timeout=30,
                               allow_redirects=False).status_code == 403:
                    nk_blocked_proxy = True
                else:
                    db.delete("NK")
            except Exception:
                nk_blocked_proxy = True
            if nk_blocked_proxy:
                print(
                    u"Der Zugriff auf NK ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("NK", "Blocked")
                scraper = cloudscraper.create_scraper()

        # WW: the ajax endpoint must return the release-list markup
        if not ww:
            db.store("WW", "Blocked")
        else:
            try:
                ww_test = scraper.post(ww_url + "/ajax",
                                       data="p=1&t=l&q=1",
                                       proxies=proxies,
                                       timeout=30,
                                       allow_redirects=False)
                if not ww_test.text or ww_test.status_code not in (
                        200,
                        304) or '<span class="main-rls">' not in ww_test.text:
                    ww_blocked_proxy = True
                else:
                    db.delete("WW")
            except Exception:
                ww_blocked_proxy = True
            if ww_blocked_proxy:
                print(
                    u"Der Zugriff auf WW ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("WW", "Blocked")
                scraper = cloudscraper.create_scraper()

        if not dd:
            db.store("DD", "Blocked")
        else:
            try:
                if scraper.get(dd_url,
                               proxies=proxies,
                               timeout=30,
                               allow_redirects=False).status_code == 403:
                    dd_blocked_proxy = True
                else:
                    db.delete("DD")
            except Exception:
                dd_blocked_proxy = True
            if dd_blocked_proxy:
                print(
                    u"Der Zugriff auf DD ist mit der aktuellen Proxy-IP nicht möglich!"
                )
                db.store("DD", "Blocked")
                scraper = cloudscraper.create_scraper()

    # Direct (non-proxied) checks: always when no proxy, otherwise only as
    # fallback for sites the proxy could not reach
    if not proxy or (proxy and sj_blocked_proxy and fallback):
        if not sj:
            db.store("SJ", "Blocked")
        else:
            try:
                if "block." in str(
                        scraper.get(
                            sj_url, timeout=30,
                            allow_redirects=False).headers.get("location")):
                    sj_blocked = True
            except Exception:
                sj_blocked = True
            if sj_blocked:
                db_normal.store("SJ", "Blocked")
                print(
                    u"Der Zugriff auf SJ ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and dj_blocked_proxy and fallback):
        if not dj:
            db.store("DJ", "Blocked")
        else:
            try:
                if "block." in str(
                        scraper.get(
                            dj_url, timeout=30,
                            allow_redirects=False).headers.get("location")):
                    dj_blocked = True
            except Exception:
                dj_blocked = True
            if dj_blocked:
                db_normal.store("DJ", "Blocked")
                print(
                    u"Der Zugriff auf DJ ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and sf_blocked_proxy and fallback):
        if not sf:
            db.store("SF", "Blocked")
        else:
            try:
                delta = datetime.datetime.now().strftime("%Y-%m-%d")
                sf_test = scraper.get(sf_url + '/updates/' + delta,
                                      timeout=30,
                                      allow_redirects=False)
                if not sf_test.text or sf_test.status_code not in (
                        200, 304) or '<h3><a href="/' not in sf_test.text:
                    sf_blocked = True
            except Exception:
                sf_blocked = True
            if sf_blocked:
                db_normal.store("SF", "Blocked")
                print(
                    u"Der Zugriff auf SF ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and by_blocked_proxy and fallback):
        if not by:
            db.store("BY", "Blocked")
        else:
            try:
                if scraper.get(by_url, timeout=30,
                               allow_redirects=False).status_code == 403:
                    by_blocked = True
            except Exception:
                by_blocked = True
            if by_blocked:
                db_normal.store("BY", "Blocked")
                print(
                    u"Der Zugriff auf BY ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and dw_blocked_proxy and fallback):
        if not dw:
            db.store("DW", "Blocked")
        else:
            try:
                dw_test = scraper.get(dw_url +
                                      "/downloads/hauptkategorie/movies/",
                                      timeout=30,
                                      allow_redirects=False)
                if not dw_test.text or dw_test.status_code not in (
                        200, 304
                ) or '<a id="first_element" href=' not in dw_test.text:
                    dw_blocked = True
            except Exception:
                dw_blocked = True
            if dw_blocked:
                db_normal.store("DW", "Blocked")
                print(
                    u"Der Zugriff auf DW ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and fx_blocked_proxy and fallback):
        if not fx:
            db.store("FX", "Blocked")
        else:
            try:
                if scraper.get(fx_url, timeout=30,
                               allow_redirects=False).status_code == 403:
                    fx_blocked = True
            except Exception:
                fx_blocked = True
            if fx_blocked:
                db_normal.store("FX", "Blocked")
                print(
                    u"Der Zugriff auf FX ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and nk_blocked_proxy and fallback):
        if not nk:
            db.store("NK", "Blocked")
        else:
            try:
                if scraper.get(nk_url, timeout=30,
                               allow_redirects=False).status_code == 403:
                    nk_blocked = True
            except Exception:
                nk_blocked = True
            if nk_blocked:
                db_normal.store("NK", "Blocked")
                print(
                    u"Der Zugriff auf NK ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and ww_blocked_proxy and fallback):
        if not ww:
            db.store("WW", "Blocked")
        else:
            try:
                ww_test = scraper.post(ww_url + "/ajax",
                                       data="p=1&t=l&q=1",
                                       timeout=30,
                                       allow_redirects=False)
                if not ww_test.text or ww_test.status_code not in (
                        200,
                        304) or '<span class="main-rls">' not in ww_test.text:
                    ww_blocked = True
            except Exception:
                ww_blocked = True
            if ww_blocked:
                db_normal.store("WW", "Blocked")
                print(
                    u"Der Zugriff auf WW ist mit der aktuellen IP nicht möglich!"
                )

    if not proxy or (proxy and dd_blocked_proxy and fallback):
        if not dd:
            db.store("DD", "Blocked")
        else:
            try:
                if scraper.get(dd_url, timeout=30,
                               allow_redirects=False).status_code == 403:
                    dd_blocked = True
            except Exception:
                dd_blocked = True
            if dd_blocked:
                db_normal.store("DD", "Blocked")
                print(
                    u"Der Zugriff auf DD ist mit der aktuellen IP nicht möglich!"
                )

    return scraper
class DD:
    """Crawler for user-configured DD RSS feeds ('CustomDD' config section)."""

    # Site shorthand used in log entries and DB bookkeeping.
    _SITE = 'DD'

    def __init__(self, configfile, dbfile, device, logging, scraper):
        """Store paths/handles and bind the shared logger's methods.

        :param configfile: path to the FeedCrawler ini file
        :param dbfile: path to the FeedCrawler.db file
        :param device: My.JDownloader device handle (may be False)
        :param logging: pre-configured logger; its info/error/debug are reused
        :param scraper: shared cloudscraper session for HTTP requests
        """
        self.configfile = configfile
        self.dbfile = dbfile
        self.device = device
        self.config = CrawlerConfig("CustomDD", self.configfile)
        self.log_info = logging.info
        self.log_error = logging.error
        self.log_debug = logging.debug
        self.scraper = scraper
        self.db = FeedDb(self.dbfile, 'FeedCrawler')

    def periodical_task(self):
        """Fetch all configured feeds and push new releases to JDownloader.

        Returns the (possibly refreshed) device handle so the caller can
        keep reusing the session.
        """
        feeds = self.config.get("feeds")
        if feeds:
            added_items = []
            # "feeds" is a comma-separated list; strip spaces before splitting.
            feeds = feeds.replace(" ", "").split(',')
            for feed in feeds:
                feed = feedparser.parse(
                    get_url(feed, self.configfile, self.dbfile, self.scraper))
                for post in feed.entries:
                    # Release key: feed title with spaces dotted, scene style.
                    key = post.title.replace(" ", ".")

                    # Compare the post's publish time against "now - 30 min".
                    # NOTE(review): assumes the feed's timestamps parse to a
                    # fixed "+00:00" offset — verify against the actual feed.
                    epoch = datetime(1970, 1, 1)
                    current_epoch = int(time())
                    published_format = "%Y-%m-%d %H:%M:%S+00:00"
                    published_timestamp = str(parser.parse(post.published))
                    published_epoch = int((datetime.strptime(
                        published_timestamp,
                        published_format) - epoch).total_seconds())
                    if (current_epoch - 1800) > published_epoch:
                        # Only posts older than 30 minutes are processed;
                        # newer ones are skipped (see log message below).
                        link_pool = post.summary
                        unicode_links = re.findall(r'(http.*)', link_pool)
                        links = []
                        for link in unicode_links:
                            if check_hoster(link, self.configfile):
                                links.append(str(link))
                        # Fallback: accept any hoster if none matched and the
                        # option is enabled.
                        if self.config.get("hoster_fallback") and not links:
                            for link in unicode_links:
                                links.append(str(link))
                        storage = self.db.retrieve_all(key)
                        if not links:
                            # No usable hoster: notify once, then only debug-log.
                            if 'added' not in storage and 'notdl' not in storage:
                                wrong_hoster = '[' + self._SITE + '/Hoster fehlt] - ' + key
                                if 'wrong_hoster' not in storage:
                                    print(wrong_hoster)
                                    self.db.store(key, 'wrong_hoster')
                                    notify([wrong_hoster], self.configfile)
                                else:
                                    self.log_debug(wrong_hoster)
                        elif 'added' in storage:
                            self.log_debug(
                                "%s - Release ignoriert (bereits gefunden)" % key)
                        else:
                            # New release: hand the links to JDownloader.
                            self.device = myjd_download(
                                self.configfile, self.dbfile, self.device, key,
                                "FeedCrawler", links, "")
                            if self.device:
                                self.db.store(key, 'added')
                                log_entry = '[Englisch] - ' + key + ' - [' + self._SITE + ']'
                                self.log_info(log_entry)
                                notify([log_entry], self.configfile)
                                added_items.append(log_entry)
                    else:
                        self.log_debug(
                            "%s - Releasezeitpunkt weniger als 30 Minuten in der Vergangenheit - wird ignoriert." % key)
        else:
            self.log_debug("Liste ist leer. Stoppe Suche für " + self._SITE + "!")
        return self.device
def download(configfile, dbfile, device, title, subdir, old_links, password, full_path=None, autostart=False):
    """Send a package of links to the My.JDownloader linkgrabber.

    :param configfile: path to the FeedCrawler ini file
    :param dbfile: path to the FeedCrawler.db file
    :param device: My.JDownloader device handle (re-fetched when stale)
    :param title: package name; also the key in the 'crawldog' DB table
    :param subdir: subfolder used when the Crawljobs 'subdir' option is set
    :param old_links: download link or list of links (duplicates dropped)
    :param password: extract/download password forwarded to JDownloader
    :param full_path: explicit destination folder overriding the subdir logic
    :param autostart: start the download right after grabbing
    :return: the (possibly refreshed) device on success, False on failure
    """
    try:
        if not device or not is_device(device):
            device = get_device(configfile)

        # De-duplicate while keeping the original order.
        if isinstance(old_links, list):
            links = list(dict.fromkeys(old_links))
        else:
            links = [old_links]
        # The API expects the list serialized without spaces.
        links = str(links).replace(" ", "")

        crawljobs = CrawlerConfig('Crawljobs', configfile)
        usesubdir = crawljobs.get("subdir")
        priority = "DEFAULT"
        if full_path:
            path = full_path
        else:
            if usesubdir:
                path = subdir + "/<jd:packagename>"
            else:
                path = "<jd:packagename>"
        if "Remux" in path:
            # Remux packages get lower queue priority.
            priority = "LOWER"

        # Build the payload once; it is reused verbatim for the retry below.
        params = [{
            "autostart": autostart,
            "links": links,
            "packageName": title,
            "extractPassword": password,
            "priority": priority,
            "downloadPassword": password,
            "destinationFolder": path,
            "comment": "FeedCrawler by rix1337",
            "overwritePackagizerRules": False
        }]
        try:
            device.linkgrabber.add_links(params=params)
        except feedcrawler.myjdapi.TokenExpiredException:
            # Session token expired: fetch a fresh device and retry once.
            device = get_device(configfile)
            if not device or not is_device(device):
                return False
            device.linkgrabber.add_links(params=params)

        # Record the package so crawldog can watch/retry it later.
        db = FeedDb(dbfile, 'crawldog')
        if db.retrieve(title):
            db.delete(title)
            db.store(title, 'retried')
        else:
            db.store(title, 'added')
        return device
    except feedcrawler.myjdapi.MYJDException as e:
        print(u"Fehler bei der Verbindung mit MyJDownloader: " + str(e))
        return False
def crawler(configfile, dbfile, device, feedcrawler, log_level, log_file, log_format):
    """Main crawl loop: run all search tasks, then sleep until the next pass.

    Sets up console + rotating-file logging (plus a DEBUG file at level 10),
    then loops forever: refresh the device, reset the request cache, run the
    Ombi search and every task from search_pool, record timing info in the
    'crawltimes' DB table, and wait interval + random jitter. The wait can be
    cut short via the 'startnow' flag in the 'crawltimes' table. With the
    --testlauf CLI flag the function returns after one pass.

    Bug fix: ombi_string was cleared *before* the summary print that
    conditionally included it, so the Ombi summary never reached stdout;
    the reset now happens after it is used.
    """
    sys.stdout = Unbuffered(sys.stdout)

    # --- logging setup -----------------------------------------------------
    logger = logging.getLogger('feedcrawler')
    logger.setLevel(log_level)
    console = logging.StreamHandler(stream=sys.stdout)
    formatter = logging.Formatter(log_format)
    console.setLevel(log_level)
    logfile = logging.handlers.RotatingFileHandler(log_file)
    logfile.setFormatter(formatter)
    logfile.setLevel(logging.INFO)
    logger.addHandler(logfile)
    logger.addHandler(console)
    if log_level == 10:
        # DEBUG level: mirror everything into a separate debug log file.
        logfile_debug = logging.handlers.RotatingFileHandler(
            log_file.replace("FeedCrawler.log", "FeedCrawler_DEBUG.log"))
        logfile_debug.setFormatter(formatter)
        logfile_debug.setLevel(10)
        logger.addHandler(logfile_debug)

    disable_request_warnings(InsecureRequestWarning)
    log_debug = logger.debug
    ombi_first_launch = True
    crawltimes = FeedDb(dbfile, "crawltimes")
    arguments = docopt(__doc__, version='FeedCrawler')

    while True:
        try:
            if not device or not is_device(device):
                device = get_device(configfile)
            # Fresh cache per pass.
            FeedDb(dbfile, 'cached_requests').reset()
            FeedDb(dbfile, 'cached_requests').cleanup()
            scraper = check_url(configfile, dbfile)
            start_time = time.time()
            crawltimes.update_store("active", "True")
            crawltimes.update_store("start_time", start_time * 1000)
            log_debug("--------Alle Suchfunktion gestartet.--------")

            # --- Ombi search ------------------------------------------------
            requested_movies = 0
            requested_shows = 0
            ombi_string = ""
            if device:
                ombi_results = ombi(configfile, dbfile, device, log_debug,
                                    ombi_first_launch)
                device = ombi_results[0]
                ombi_results = ombi_results[1]
                requested_movies = ombi_results[0]
                requested_shows = ombi_results[1]
                ombi_first_launch = False
            if requested_movies or requested_shows:
                ombi_string = u"Die Ombi-Suche lief für: "
                if requested_movies:
                    ombi_string = ombi_string + str(
                        requested_movies) + " Filme"
                if requested_shows:
                    ombi_string = ombi_string + " und "
                if requested_shows:
                    ombi_string = ombi_string + str(
                        requested_shows) + " Serien"

            # --- site search tasks -----------------------------------------
            for task in search_pool(configfile, dbfile, device, logger,
                                    scraper):
                name = task._SITE
                try:
                    file = " - Liste: " + task.filename
                except AttributeError:
                    # Not every task has a list file.
                    file = ""
                log_debug("-----------Suchfunktion (" + name + file +
                          ") gestartet!-----------")
                device = task.periodical_task()
                log_debug("-----------Suchfunktion (" + name + file +
                          ") ausgeführt!-----------")

            # --- summary and wait computation --------------------------------
            cached_requests = FeedDb(dbfile, 'cached_requests').count()
            request_cache_string = u"Der FeedCrawler-Cache hat " + str(
                cached_requests) + " HTTP-Requests gespart!"
            end_time = time.time()
            total_time = end_time - start_time
            interval = int(feedcrawler.get('interval')) * 60
            # Random jitter of up to a quarter interval.
            random_range = random.randrange(0, interval // 4)
            wait = interval + random_range
            next_start = end_time + wait
            log_debug(
                time.strftime("%Y-%m-%d %H:%M:%S") +
                " - Alle Suchfunktion ausgeführt (Dauer: " +
                readable_time(total_time) + u")!")
            if ombi_string:
                log_debug(
                    time.strftime("%Y-%m-%d %H:%M:%S") + u" - " + ombi_string)
            log_debug(
                time.strftime("%Y-%m-%d %H:%M:%S") + u" - " +
                request_cache_string)
            log_debug("-----------Wartezeit bis zum nächsten Suchlauf: " +
                      readable_time(wait) + '-----------')
            print(
                time.strftime("%Y-%m-%d %H:%M:%S") +
                u" - Alle Suchfunktion ausgeführt (Dauer: " +
                readable_time(total_time) + u")!",
                ombi_string + " - " + request_cache_string
                if ombi_string else request_cache_string)
            print(u"-----------Wartezeit bis zum nächsten Suchlauf: " +
                  readable_time(wait) + '-----------')
            # Reset AFTER the print above has used it (was cleared too early).
            ombi_string = ""
            crawltimes.update_store("end_time", end_time * 1000)
            crawltimes.update_store("total_time", readable_time(total_time))
            crawltimes.update_store("next_start", next_start * 1000)
            crawltimes.update_store("active", "False")
            FeedDb(dbfile, 'cached_requests').reset()
            FeedDb(dbfile, 'cached_requests').cleanup()

            if arguments['--testlauf']:
                log_debug(u"-----------Testlauf beendet!-----------")
                print(u"-----------Testlauf beendet!-----------")
                return

            # --- interruptible wait ------------------------------------------
            # Sleep in 10 s chunks so a 'startnow' flag can cut the wait short.
            wait_chunks = wait // 10
            start_now_triggered = False
            while wait_chunks:
                time.sleep(10)
                if FeedDb(dbfile, 'crawltimes').retrieve("startnow"):
                    FeedDb(dbfile, 'crawltimes').delete("startnow")
                    start_now_triggered = True
                    break
                wait_chunks -= 1
            if start_now_triggered:
                log_debug("----------Wartezeit vorzeitig beendet----------")
            else:
                log_debug("-------------Wartezeit verstrichen-------------")
        except Exception:
            traceback.print_exc()
            time.sleep(10)
def check_valid_release(title, retail_only, hevc_retail, dbfile):
    """Decide whether *title* should still be grabbed.

    Looks up previously found releases of the same movie/episode in the
    'FeedCrawler' DB table, sorts them into source-quality buckets
    (retail bluray > bluray > web > trash > unknown) and rejects *title*
    unless it improves on what was already found (better source, a
    ".proper", or — with hevc_retail — the first HEVC in its bucket).

    :param title: release name to check
    :param retail_only: if truthy, non-retail titles are rejected outright
    :param hevc_retail: if truthy, a first HEVC release may pass even when
        an equal-quality release exists
    :param dbfile: path to the FeedCrawler.db file
    :return: True when the release should be taken, False otherwise

    Bug fix: the bluray branch previously tested ``is_retail(r, False)``
    where ``r`` was the leftover loop variable from the categorization
    loop; it must test the candidate *title* itself.
    """
    if retail_only:
        if not is_retail(title, False):
            return False

    # Derive the search prefix: everything before the language/quality tag.
    if ".German" in title:
        search_title = title.split(".German")[0]
    elif ".GERMAN" in title:
        search_title = title.split(".GERMAN")[0]
    else:
        try:
            quality = re.findall(r"\d{3,4}p", title)[0]
            search_title = title.split(quality)[0]
        except IndexError:
            # No recognizable quality tag: nothing comparable stored, allow it.
            return True

    db = FeedDb(dbfile, 'FeedCrawler')
    is_episode = re.findall(r'.*\.s\d{1,3}(e\d{1,3}|e\d{1,3}-.*\d{1,3})\..*',
                            title, re.IGNORECASE)
    if is_episode:
        # Strip the episode name so single episodes and season packs of the
        # same show compare against each other.
        episode_name = re.findall(r'.*\.s\d{1,3}e\d{1,3}(\..*)', search_title,
                                  re.IGNORECASE)
        if episode_name:
            search_title = search_title.replace(episode_name[0], "")
        season_search_title = search_title.replace(is_episode[0], "") + "."
        season_results = db.retrieve_all_beginning_with(season_search_title)
        results = db.retrieve_all_beginning_with(search_title) + season_results
    else:
        results = db.retrieve_all_beginning_with(search_title)
    if not results:
        return True

    bluray_tags = [
        ".bd-rip.", ".br-rip.", ".bluray-rip.", ".bluray.", ".bd-disk.",
        ".bd.", ".bd5.", ".bd9.", ".bd25.", ".bd50."
    ]
    web_tags = [
        ".web.", ".web-rip.", ".webrip.", ".vod-rip.", ".webdl.", ".web-dl.",
        ".ddc."
    ]
    trash_tags = [
        ".cam.", ".cam-rip.", ".ts.", ".telesync.", ".wp.", ".workprint.",
        ".tc.", ".telecine.", ".vhs-rip.", ".tv-rip.", ".hdtv-rip.",
        ".hdtv.", ".tvrip.", ".hdtvrip.", ".sat-rip.", ".dvb-rip.",
        ".ds-rip.", ".scr.", ".screener.", ".dvdscr.", ".dvdscreener.",
        ".bdscr.", ".r5.", ".dvdrip.", ".dvd."
    ]
    unknown = []
    trash = []
    web = []
    bluray = []
    retail = []
    # Get all previously found Releases and categorize them by their tags
    for r in results:
        if any(s in r.lower() for s in bluray_tags):
            if is_retail(r, False):
                retail.append(r)
            else:
                bluray.append(r)
        elif any(s in r.lower() for s in web_tags):
            web.append(r)
        elif any(s in r.lower() for s in trash_tags):
            trash.append(r)
        else:
            unknown.append(r)

    # Categorize the current Release by its tag to check if a release of the
    # same or better category was already found.
    # If no release is in the higher category, propers are allowed anytime.
    # If no HEVC is available in the current category or higher and the
    # current release is HEVC, it will be allowed.
    if any(s in title.lower() for s in bluray_tags):
        if is_retail(title, False):  # fixed: was is_retail(r, False)
            if len(retail) > 0:
                if hevc_retail:
                    if is_hevc(title):
                        no_hevc = True
                        for r in retail:
                            if is_hevc(r):
                                no_hevc = False
                        if no_hevc:
                            return True
                if ".proper" in title.lower():
                    return True
                return False
        else:
            if len(retail) == 0 and len(bluray) > 0:
                if ".proper" in title.lower():
                    return True
            if len(retail) > 0 or len(bluray) > 0:
                if hevc_retail:
                    if is_hevc(title):
                        no_hevc = True
                        for r in retail + bluray:
                            if is_hevc(r):
                                no_hevc = False
                        if no_hevc:
                            return True
                return False
    elif any(s in title.lower() for s in web_tags):
        if len(retail) == 0 and len(bluray) == 0 and len(web) > 0:
            if ".proper" in title.lower():
                return True
        if len(retail) > 0 or len(bluray) > 0 or len(web) > 0:
            if hevc_retail:
                if is_hevc(title):
                    no_hevc = True
                    for r in retail + bluray + web:
                        if is_hevc(r):
                            no_hevc = False
                    if no_hevc:
                        return True
            return False
    elif any(s in title.lower() for s in trash_tags):
        if len(retail) == 0 and len(bluray) == 0 and len(
                web) == 0 and len(trash) > 0:
            if ".proper" in title.lower():
                return True
        if len(retail) > 0 or len(bluray) > 0 or len(web) > 0 or len(
                trash) > 0:
            return False
    else:
        if len(retail) == 0 and len(bluray) == 0 and len(web) == 0 and len(
                trash) == 0 and len(unknown) > 0:
            if ".proper" in title.lower():
                return True
        if len(retail) > 0 or len(bluray) > 0 or len(web) > 0 or len(
                trash) > 0 or len(unknown) > 0:
            return False
    return True
def get_redirected_url(url, configfile, dbfile, scraper=False):
    """Return the redirect target of *url*, honoring per-site block state.

    Sites are identified via check_is_site and matched in a fixed order.
    With a proxy configured: a site flagged blocked-for-proxy is fetched
    directly (only when fallback is enabled and the site is not also
    blocked without proxy), otherwise *url* is returned unresolved; all
    other requests go through the proxy. Without a proxy: a site flagged
    blocked is returned unresolved. WW never gets its redirect resolved.
    On any error the original *url* is returned.

    :param url: the URL whose 'location' redirect header is wanted
    :param configfile: path to the FeedCrawler ini file
    :param dbfile: path to the FeedCrawler.db file
    :param scraper: optional cloudscraper session; created when missing
    """
    config = CrawlerConfig('FeedCrawler', configfile)
    proxy = config.get('proxy')
    if not scraper:
        scraper = cloudscraper.create_scraper()
    db = FeedDb(dbfile, 'proxystatus')
    db_normal = FeedDb(dbfile, 'normalstatus')
    site = check_is_site(url, configfile)
    # Same match order as the original per-site chain.
    sites = ("SJ", "DJ", "SF", "BY", "DW", "FX", "NK", "WW", "DD")
    if proxy:
        try:
            if site:
                for s in sites:
                    if s in site:
                        if s == "WW":
                            # WW redirects are never followed.
                            return url
                        if db.retrieve(s):
                            if config.get("fallback") and not db_normal.retrieve(s):
                                # Blocked via proxy but reachable directly:
                                # resolve without the proxy. Public header
                                # lookup replaces the old private
                                # headers._store["location"][1] access.
                                return scraper.get(
                                    url,
                                    allow_redirects=False,
                                    timeout=30).headers["location"]
                            return url
                        # First matching site not flagged: use the proxy path.
                        break
            proxies = {'http': proxy, 'https': proxy}
            response = scraper.get(url,
                                   allow_redirects=False,
                                   proxies=proxies,
                                   timeout=30).headers["location"]
            return response
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return url
    else:
        try:
            if site:
                for s in sites:
                    # Blocked sites (and WW always) are returned unresolved.
                    if s in site and (s == "WW" or db_normal.retrieve(s)):
                        return url
            response = scraper.get(url, allow_redirects=False,
                                   timeout=30).headers["location"]
            return response
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return url
def post_url_headers(url, configfile, dbfile, headers, data, scraper=False):
    """POST *data* to *url* with *headers*, honoring per-site block state.

    Sites are identified via check_is_site and matched in a fixed order.
    With a proxy configured: a site flagged blocked-for-proxy is posted to
    directly (only when fallback is enabled and the site is not also
    blocked without proxy), otherwise an empty response is returned; all
    other requests go through the proxy. Without a proxy: a site flagged
    blocked yields an empty response. Errors also yield an empty response.

    :return: [response-or-empty-string, scraper]
    """
    config = CrawlerConfig('FeedCrawler', configfile)
    proxy = config.get('proxy')
    if not scraper:
        scraper = cloudscraper.create_scraper()
    db = FeedDb(dbfile, 'proxystatus')
    db_normal = FeedDb(dbfile, 'normalstatus')
    site = check_is_site(url, configfile)
    # Same match order as the original per-site chain.
    sites = ("SJ", "DJ", "SF", "BY", "DW", "FX", "NK", "WW", "DD")
    if proxy:
        try:
            if site:
                for s in sites:
                    if s in site:
                        if db.retrieve(s):
                            if config.get("fallback") and not db_normal.retrieve(s):
                                # Blocked via proxy but reachable directly:
                                # post without the proxy.
                                return [
                                    scraper.post(url,
                                                 data,
                                                 headers=headers,
                                                 timeout=30), scraper
                                ]
                            return ["", scraper]
                        # First matching site not flagged: use the proxy path.
                        break
            proxies = {'http': proxy, 'https': proxy}
            response = scraper.post(url,
                                    data,
                                    headers=headers,
                                    proxies=proxies,
                                    timeout=30)
            return [response, scraper]
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return ["", scraper]
    else:
        try:
            if site:
                for s in sites:
                    # Blocked sites yield an empty response.
                    if s in site and db_normal.retrieve(s):
                        return ["", scraper]
            response = scraper.post(url, data, headers=headers, timeout=30)
            return [response, scraper]
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return ["", scraper]
def __init__(self, configfile, dbfile, device, logging, scraper, filename):
    """Set up configuration, feed URLs, CDC state and helper bindings.

    :param configfile: path to the FeedCrawler ini file
    :param dbfile: path to the FeedCrawler.db file
    :param device: My.JDownloader device handle
    :param logging: pre-configured logger; its info/error/debug are reused
    :param scraper: shared cloudscraper session
    :param filename: list name controlling movie vs. season mode
    """
    self.configfile = configfile
    self.dbfile = dbfile
    self.device = device
    self.hostnames = CrawlerConfig('Hostnames', self.configfile)
    self.url = self.hostnames.get('dw')
    # First hostname label doubles as the archive password.
    self.password = self.url.split('.')[0]

    # Seasons list crawls the series category, everything else movies.
    if "List_ContentAll_Seasons" not in filename:
        category = "/downloads/hauptkategorie/movies/"
    else:
        category = "/downloads/hauptkategorie/serien/"
    self.URL = 'https://' + self.url + category
    self.FEED_URLS = [self.URL]

    self.config = CrawlerConfig("ContentAll", self.configfile)
    self.feedcrawler = CrawlerConfig("FeedCrawler", self.configfile)
    self.log_info = logging.info
    self.log_error = logging.error
    self.log_debug = logging.debug
    self.scraper = scraper
    self.filename = filename
    self.pattern = False
    self.db = FeedDb(self.dbfile, 'FeedCrawler')
    self.hevc_retail = self.config.get("hevc_retail")
    self.retail_only = self.config.get("retail_only")
    self.hosters = CrawlerConfig("Hosters", configfile).get_section()
    self.hoster_fallback = self.config.get("hoster_fallback")
    self.prefer_dw_mirror = self.feedcrawler.get("prefer_dw_mirror")

    # Add one paginated feed URL per extra search page (pages 2..n).
    page_count = int(
        CrawlerConfig("ContentAll", self.configfile).get("search"))
    for page_no in range(2, page_count + 1):
        paginated_url = self.URL + "order/zeit/sort/D/seite/" + str(
            page_no) + "/"
        if paginated_url not in self.FEED_URLS:
            self.FEED_URLS.append(paginated_url)

    # Change-detection state from previous runs.
    self.cdc = FeedDb(self.dbfile, 'cdc')
    self.last_set_all = self.cdc.retrieve("ALLSet-" + self.filename)
    self.headers = {
        'If-Modified-Since':
            str(self.cdc.retrieve(self._SITE + "Headers-" + self.filename))
    }
    self.last_sha = self.cdc.retrieve(self._SITE + "-" + self.filename)

    # Snapshot of all options that invalidate the CDC state when changed.
    option_keys = [
        "quality", "search", "ignore", "regex", "cutoff", "enforcedl",
        "crawlseasons", "seasonsquality", "seasonpacks", "seasonssource",
        "imdbyear", "imdb", "hevc_retail", "retail_only", "hoster_fallback"
    ]
    self.settings = [
        self.feedcrawler.get("english"),
        self.feedcrawler.get("surround"),
        self.feedcrawler.get("prefer_dw_mirror"),
        self.hosters,
    ]
    self.settings.extend(self.config.get(option) for option in option_keys)

    self.search_imdb_done = False
    self.search_regular_done = False
    self.dl_unsatisfied = False

    # Site-specific strategy hooks used by the shared crawl logic.
    self.get_feed_method = dw_feed_enricher
    self.get_url_method = get_url
    self.get_url_headers_method = get_url_headers
    self.get_download_links_method = dw_get_download_links
    self.download_method = add_decrypt_instead_of_download

    try:
        self.imdb = float(self.config.get('imdb'))
    except:
        self.imdb = 0.0
def crawldog(configfile, dbfile):
    """Watchdog loop for packages previously handed to My.JDownloader.

    Polls the JDownloader state every 30 s (5 s while the grabber is
    collecting) and, for every watched title in the 'crawldog' DB table:
    runs hoster checks on decrypted packages, strips unwanted episodes
    from season packs when an 'episode_remover' entry exists, reports
    offline packages, and retries or re-queues encrypted packages.
    Runs forever; exceptions are printed and the loop continues.
    """
    disable_request_warnings(InsecureRequestWarning)
    crawljobs = CrawlerConfig('Crawljobs', configfile)
    autostart = crawljobs.get("autostart")
    db = FeedDb(dbfile, 'crawldog')

    grabber_was_collecting = False
    grabber_collecting = False
    device = False

    while True:
        try:
            if not device or not is_device(device):
                device = get_device(configfile)

            myjd_packages = get_info(configfile, device)
            if myjd_packages:
                grabber_collecting = myjd_packages[2]

                if grabber_was_collecting or grabber_collecting:
                    # Grabber busy (now or on the previous poll): wait
                    # briefly instead of inspecting a half-built state.
                    grabber_was_collecting = grabber_collecting
                    time.sleep(5)
                else:
                    # myjd_packages[4] bundles the package lists by state.
                    packages_in_downloader_decrypted = myjd_packages[4][0]
                    packages_in_linkgrabber_decrypted = myjd_packages[4][1]
                    offline_packages = myjd_packages[4][2]
                    encrypted_packages = myjd_packages[4][3]

                    try:
                        watched_titles = db.retrieve_all_titles()
                    except:
                        watched_titles = False

                    notify_list = []

                    if packages_in_downloader_decrypted or packages_in_linkgrabber_decrypted or offline_packages or encrypted_packages:
                        if watched_titles:
                            for title in watched_titles:
                                # Titles match a package either dotted or
                                # with dots replaced by spaces.
                                if packages_in_downloader_decrypted:
                                    for package in packages_in_downloader_decrypted:
                                        if title[0] in package[
                                                'name'] or title[0].replace(
                                                    ".",
                                                    " ") in package['name']:
                                            check = hoster_check(
                                                configfile, device, [package],
                                                title[0], [0])
                                            device = check[0]
                                            if device:
                                                # Package healthy: stop watching.
                                                db.delete(title[0])
                                if packages_in_linkgrabber_decrypted:
                                    for package in packages_in_linkgrabber_decrypted:
                                        if title[0] in package[
                                                'name'] or title[0].replace(
                                                    ".",
                                                    " ") in package['name']:
                                            check = hoster_check(
                                                configfile, device, [package],
                                                title[0], [0])
                                            device = check[0]
                                            # A stored episode number means only
                                            # that episode of a season pack is
                                            # wanted; drop all other links.
                                            episode = FeedDb(
                                                dbfile,
                                                'episode_remover').retrieve(
                                                    title[0])
                                            if episode:
                                                filenames = package[
                                                    'filenames']
                                                if len(filenames) > 1:
                                                    # Extract an episode number
                                                    # per filename: prefer the
                                                    # SxxEyy pattern, otherwise
                                                    # strip common codec/audio
                                                    # digits and keep the rest.
                                                    fname_episodes = []
                                                    for fname in filenames:
                                                        try:
                                                            if re.match(
                                                                    r'.*S\d{1,3}E\d{1,3}.*',
                                                                    fname,
                                                                    flags=re.
                                                                    IGNORECASE
                                                            ):
                                                                fname = re.findall(
                                                                    r'S\d{1,3}E(\d{1,3})',
                                                                    fname,
                                                                    flags=re.
                                                                    IGNORECASE
                                                                ).pop()
                                                            else:
                                                                fname = fname.replace(
                                                                    "hddl8", ""
                                                                ).replace(
                                                                    "dd51", ""
                                                                ).replace(
                                                                    "264", ""
                                                                ).replace(
                                                                    "265", "")
                                                        except:
                                                            fname = fname.replace(
                                                                "hddl8",
                                                                "").replace(
                                                                    "dd51", ""
                                                                ).replace(
                                                                    "264", ""
                                                                ).replace(
                                                                    "265", "")
                                                        fname_episode = "".join(
                                                            re.findall(
                                                                r'\d+',
                                                                fname.split(
                                                                    ".part")
                                                                [0]))
                                                        try:
                                                            fname_episodes.append(
                                                                str(
                                                                    int(fname_episode
                                                                        )))
                                                        except:
                                                            pass
                                                    # Iteratively strip the
                                                    # longest shared digit run
                                                    # (e.g. season/resolution)
                                                    # until only the episode
                                                    # numbers differ.
                                                    replacer = longest_substr(
                                                        fname_episodes)
                                                    new_fname_episodes = []
                                                    for new_ep_fname in fname_episodes:
                                                        try:
                                                            new_fname_episodes.append(
                                                                str(
                                                                    int(
                                                                        new_ep_fname
                                                                        .
                                                                        replace(
                                                                            replacer,
                                                                            ""
                                                                        ))))
                                                        except:
                                                            pass
                                                    replacer = longest_substr(
                                                        new_fname_episodes)
                                                    newer_fname_episodes = []
                                                    for new_ep_fname in new_fname_episodes:
                                                        try:
                                                            newer_fname_episodes.append(
                                                                str(
                                                                    int(
                                                                        re.sub(
                                                                            replacer,
                                                                            "",
                                                                            new_ep_fname,
                                                                            1))
                                                                ))
                                                        except:
                                                            pass
                                                    replacer = longest_substr(
                                                        newer_fname_episodes)
                                                    even_newer_fname_episodes = []
                                                    for newer_ep_fname in newer_fname_episodes:
                                                        try:
                                                            even_newer_fname_episodes.append(
                                                                str(
                                                                    int(
                                                                        re.sub(
                                                                            replacer,
                                                                            "",
                                                                            newer_ep_fname,
                                                                            1))
                                                                ))
                                                        except:
                                                            pass
                                                    # Use the most-reduced list
                                                    # that still has entries.
                                                    if even_newer_fname_episodes:
                                                        fname_episodes = even_newer_fname_episodes
                                                    elif newer_fname_episodes:
                                                        fname_episodes = newer_fname_episodes
                                                    elif new_fname_episodes:
                                                        fname_episodes = new_fname_episodes
                                                    # NOTE(review): assumes
                                                    # linkids aligns 1:1 with
                                                    # filenames — verify against
                                                    # get_info's package format.
                                                    delete_linkids = []
                                                    pos = 0
                                                    for delete_id in package[
                                                            'linkids']:
                                                        if str(episode) != str(
                                                                fname_episodes[
                                                                    pos]):
                                                            delete_linkids.append(
                                                                delete_id)
                                                        pos += 1
                                                    if delete_linkids:
                                                        delete_uuids = [
                                                            package['uuid']
                                                        ]
                                                        FeedDb(
                                                            dbfile,
                                                            'episode_remover'
                                                        ).delete(title[0])
                                                        device = remove_from_linkgrabber(
                                                            configfile, device,
                                                            delete_linkids,
                                                            delete_uuids)
                                            if autostart:
                                                device = move_to_downloads(
                                                    configfile, device,
                                                    package['linkids'],
                                                    [package['uuid']])
                                            if device:
                                                db.delete(title[0])
                                if offline_packages:
                                    for package in offline_packages:
                                        if title[0] in package[
                                                'name'] or title[0].replace(
                                                    ".",
                                                    " ") in package['name']:
                                            notify_list.append("[Offline] - " +
                                                               title[0])
                                            print((u"[Offline] - " + title[0]))
                                            db.delete(title[0])
                                if encrypted_packages:
                                    for package in encrypted_packages:
                                        if title[0] in package[
                                                'name'] or title[0].replace(
                                                    ".",
                                                    " ") in package['name']:
                                            if title[1] == 'added':
                                                # First failure: retry the
                                                # decryption once.
                                                if retry_decrypt(
                                                        configfile, dbfile,
                                                        device,
                                                        package['linkids'],
                                                        [package['uuid']],
                                                        package['urls']):
                                                    db.delete(title[0])
                                                    db.store(
                                                        title[0], 'retried')
                                            else:
                                                # Already retried: hand over to
                                                # manual Click'n'Load handling.
                                                add_decrypt(
                                                    package['name'],
                                                    package['url'], "", dbfile)
                                                device = remove_from_linkgrabber(
                                                    configfile, device,
                                                    package['linkids'],
                                                    [package['uuid']])
                                                notify_list.append(
                                                    "[Click'n'Load notwendig] - "
                                                    + title[0])
                                                print(
                                                    u"[Click'n'Load notwendig] - "
                                                    + title[0])
                                                db.delete(title[0])
                    else:
                        # No packages anywhere: the watch list is stale.
                        if not grabber_collecting:
                            db.reset()

                    if notify_list:
                        notify(notify_list, configfile)

                time.sleep(30)
            else:
                print(
                    u"Scheinbar ist der JDownloader nicht erreichbar - bitte prüfen und neustarten!"
                )
        except Exception:
            traceback.print_exc()
            time.sleep(30)