def downloadWebpage(self):
    print 'Trying to get website information...please wait...'
    cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + self.name + '_' + 'eplist.dat'
    if os.path.isfile(cache) and (time.time() - os.path.getmtime(cache)) < 43200:  # cache younger than 12 h
        print "Use local file..."
        webpage = urlopen(cache)
    else:
        if serieslinks.has_key(self.name):
            title = serieslinks[self.name]
        else:
            title = self.name.replace(' ', '-')
        webpage = urlopen('http://www.fernsehserien.de/' + title + '/episodenguide').read()
        if not os.path.isdir(self.name.replace('-', ' ')):
            os.mkdir(self.name.replace('-', ' '))
        if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
            os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)
        f = open(cache, 'w')
        f.write(webpage)
        f.close()
        print 'Website successfully scraped'
    self.soupobj = BeautifulSoup(webpage, "html.parser")
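# Both downloadWebpage() and getTimeTable() gate on the same freshness test:
# reuse the cache file if it is younger than 43200 s (12 h). A minimal sketch,
# assuming one wanted to factor it out (the helper name is hypothetical):
def cache_is_fresh(path, max_age=43200):  # 43200 s = 12 h
    return os.path.isfile(path) and (time.time() - os.path.getmtime(path)) < max_age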
def getTimeTable(self, sender):
    print 'Trying to get timetable information...please wait...'
    if senderlinks.has_key(sender):
        senderlink = senderlinks[sender]
    else:
        print 'Link to channel ' + sender + ' not found'
        return 0
    cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + self.name + '_ttlist.dat'
    if os.path.isfile(cache) and (time.time() - os.path.getmtime(cache)) < 43200:  # cache younger than 12 h
        print "Use local file..."
        webpage = urlopen(cache)
    else:
        if serieslinks.has_key(self.name.replace(' ', '-')):
            title = serieslinks[self.name.replace(' ', '-')]
        else:
            title = self.name.replace(' ', '-')
        webpage = urlopen('http://www.fernsehserien.de/' + title + '/sendetermine/' + senderlink + '/-1').read()
        if not os.path.isdir(self.name):
            os.mkdir(self.name)
        if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
            os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)
        f = open(cache, 'w')
        f.write(webpage)
        f.close()
        print 'Website successfully scraped'
    soup = BeautifulSoup(webpage, "html.parser")
    tddata = soup.select("tr")
    epdate, eptime, season, episode, title = [], [], [], [], []
    for index, item in enumerate(tddata):
        if index % 2 != 0 and index > 0:  # every second row carries the airing data
            m = re.search("(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?>(\d{1,3})<.*?>(\d{1,2}).*?>(\d{1,2}).*?>([^<]+)", str(item))
            if m is not None:
                epdate.append(m.group(1))
                eptime.append(m.group(2))
                season.append(m.group(4))
                episode.append(m.group(5))
                title.append(m.group(6))
    return (epdate, season, episode, title, eptime)
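# Minimal consumer sketch (illustrative, not part of the original source):
# getTimeTable() returns five parallel lists, one entry per airing, so a
# caller can zip them into rows. print_timetable is a hypothetical name.
def print_timetable(scraper, sender):
    result = scraper.getTimeTable(sender)
    if result == 0:  # unknown sender, see the early return above
        return
    epdate, season, episode, title, eptime = result
    for d, s, e, t, w in zip(epdate, season, episode, title, eptime):
        print '%s %s  S%02dE%03d  %s' % (d, w, int(s), int(e), t)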
def downloadWebpage(self):
    logging.info('Trying to get website information...please wait...')
    # strip ' USA 20xx' / ' F 20xx' tags so the name matches the tv_shows_db entry
    self.index0 = self.name.find(' USA 20')  #rb 2018-07-10 saves entries in tv_shows_db
    if self.index0 > -1:
        logging.info('USA 20 found...please wait...')
        self.name = self.name[0:self.index0] + self.name[self.index0 + 9:]
        logging.info(self.name)
    else:
        logging.info(self.name)
        self.index0 = self.name.find(' F 20')  #rb 2022-01-14 saves entries in tv_shows_db
        if self.index0 > -1:
            logging.info('F 20 found...please wait...')
            self.name = self.name[0:self.index0] + self.name[self.index0 + 7:]
            logging.info(self.name)
    cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + self.name + '_' + 'eplist.html'
    if os.path.isfile(cache) and (time.time() - os.path.getmtime(cache)) < 43200:  # cache younger than 12 h
        logging.info('Use local file...')
        webpage = urlopen(cache)
    else:
        logging.info('self.name:' + self.name)
        if serieslinks.has_key(self.name):
            title = serieslinks[self.name]
        else:
            title = self.name.replace(' ', '-')
            title = title.replace('---', '-')  # only a single '-'
        webpage = urlopen('https://www.fernsehserien.de/' + title + '/episodenguide').read()
        if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
            os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)
        logging.info('Website scraping => done')
        f = open(cache, 'w')
        f.write(webpage)
        f.close()
    self.soupobj = BeautifulSoup(webpage, "html.parser")
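# Standalone sketch of the suffix stripping above, for clarity (the function
# name is an illustrative assumption, not part of the scraper): the slice
# offsets 9 and 7 remove ' USA 20xx' and ' F 20xx', i.e. the country tag plus
# a four-digit year.
def strip_country_year(name):  # hypothetical helper
    for tag, width in ((' USA 20', 9), (' F 20', 7)):
        i = name.find(tag)
        if i > -1:
            return name[:i] + name[i + width:]
    return name

# e.g. strip_country_year('Bull USA 2016')         -> 'Bull'
#      strip_country_year('Candice Renoir F 2013') -> 'Candice Renoir'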
def getTimeTable(self, sender):
    logging.info('Trying to get timetable information...please wait...')
    if senderlinks.has_key(sender):
        senderlink = senderlinks[sender]
    else:
        logging.warning('Link to channel ' + sender + ' not found')
        return 0
    # strip a trailing country/year tag such as ' USA 20xx' or ' F 20xx' from the series name
    m = re.search("(.*)( \w{1,3} 20)(.*)", self.name)
    if m is not None:  #rb 2022-01-14
        self.name = m.group(1)
    if serieslinks.has_key(self.name):
        title = serieslinks[self.name]
    else:
        title = self.name.replace(' ', '-')
        title = title.replace('---', '-')  # only a single '-'
    cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + title + str(conf.SZaehler) + '_' + senderlink + '_ttlist.html'  #rb
    if os.path.isfile(cache):
        # 'yy.mm.ddHH-MM' timestamp of the cache file; self.SZeit is assumed to use the same format
        self._test = datetime.datetime.fromtimestamp(os.path.getmtime(cache)).strftime("%Y.%m.%d%H-%M")[2:15]  #rb
    if os.path.isfile(cache) and (self._test > self.SZeit):
        # better condition: the cache file is newer than the air time encoded in the file name
        logging.info("Using recent cache file..." + str(conf.SZaehler))
        webpage = urlopen(cache).read()  #rb
        conf.LetzteSeite = ("erfasst" in webpage) or ('title="früher"' not in webpage)  #rb replaced
    else:
        logging.info('Loading: https://www.fernsehserien.de/' + title + '/sendetermine/' + senderlink + '/-' + str(conf.SZaehler))
        webpage = urlopen('https://www.fernsehserien.de/' + title + '/sendetermine/' + senderlink + '/-' + str(conf.SZaehler)).read()  #rb
        if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
            os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)
        f = open(cache, 'w')
        f.write(webpage)
        f.close()
        logging.info('Website scraping => done')
        conf.LetzteSeite = ("erfasst" in webpage) or ('title="früher"' not in webpage)  #rb replaced
    soup = BeautifulSoup(webpage, "html.parser")
    epdate, eptime, season, episode, title = [], [], [], [], []
    if '<tr' in webpage:
        rows = soup.findAll('tr')
        logging.info('with season number')
        for row in rows:
            # date, start time, season, episode, title; the lazy '?' quantifiers are crucial!
            m = re.search("(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?folgen\/.*?(\d{1,2})x(\d{1,3})(.*)", str(row))
            if m is not None:
                epdate.append(m.group(1))
                eptime.append(m.group(2))
                season.append(m.group(3))
                episode.append(m.group(4))
                m1 = re.search(".*title=\"zur Episode\".([\wäßüöÄÜÖ() \-\:\.\,]*)", m.group(5))  #rb 21.02.2022 comma added
                if m1 is not None:
                    title.append(m1.group(1))
                else:
                    # assumed fix: the original fallback referenced an undefined s1 and began the
                    # pattern with an invalid '*'; searching the row remainder m.group(5) with '.*'
                    # is a best guess at the intent
                    m1 = re.search('.*-sendetermine.\>([\wäßüöÄÜÖ() \-\:\.\,]*)', m.group(5))  #rb 21.02.2022 comma added
                    if m1 is not None:
                        title.append(m1.group(1))
        if len(title) == 0:
            # needed when no season number is present, e.g. Wunderschoen, Wilsberg  rb 03.11.2020
            logging.info('no season number')
            for row in rows:
                # episode number can also have three digits
                m = re.search("(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?(\d{2}:\d{2}).*?folgen/(\d{1,3}).*?episodentitel.*?zur Episode\"\>([\wäßüöÄÜÖ() \-\:\.\,]*)", str(row))  #rb 21.02.22 comma added
                if m is not None:
                    epdate.append(m.group(1))
                    eptime.append(m.group(2))
                    season.append('1')
                    episode.append(m.group(4))
                    title.append(m.group(5))
    else:
        rows = soup.findAll('a', href=True)
        for row in rows:
            m = re.search("folgen\/(\d{1,2})x(\d{1,3}).*(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?\-episodentitel.*?\"\>([\wäßüöÄÜÖ() \-\:\.\,]*)", str(row))  #rb 21.02.2022 comma added
            if m is not None:
                epdate.append(m.group(3))
                eptime.append(m.group(4))
                season.append(m.group(1))
                episode.append(m.group(2))
                title.append(m.group(5))
        if len(title) == 0:
            # needed when no season number is present, e.g. Wunderschoen, Wilsberg  rb 03.11.2020
            logging.info('no season number')
            for row in rows:
                m = re.search("folgen\/(\d{1,3}).*?(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?\-episodentitel.*?\"\>([\wäßüöÄÜÖ() \-\:\.\,]*)", str(row))  #rb 21.02.2022 comma added
                if m is not None:
                    epdate.append(m.group(2))
                    eptime.append(m.group(3))
                    season.append('1')
                    episode.append(m.group(1))
                    title.append(m.group(4))
    return (epdate, season, episode, title, eptime)
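# Hedged caller sketch (not from the original source): getTimeTable() fetches a
# single timetable page, '/-<conf.SZaehler>', and sets conf.LetzteSeite once the
# page looks like the last one ('erfasst' present, or no 'title="früher"' link).
# Assuming the caller owns the page counter, a full-history fetch could look
# like this (collect_all_airings is a hypothetical name):
def collect_all_airings(scraper, sender):
    epdate, season, episode, title, eptime = [], [], [], [], []
    conf.SZaehler = 1
    conf.LetzteSeite = False
    while not conf.LetzteSeite:
        result = scraper.getTimeTable(sender)
        if result == 0:  # unknown sender
            break
        d, s, e, t, w = result
        epdate += d
        season += s
        episode += e
        title += t
        eptime += w
        conf.SZaehler += 1
    return (epdate, season, episode, title, eptime)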