def _setDetailsForPerformances(self):
    """Load the detail page of each further performance and attach it.

    Every entry returned by ``self._getFurtherPerformances()`` is read as a
    ``(date, url)`` pair. When the time part of the date differs from
    ``'00:00'`` it is passed to ``self._setDefaultTime``. The page behind
    ``url`` is always loaded through ``Lepistant.getSoup``; the details are
    only applied when ``self.performances`` already holds an entry for the
    same day (looked up with the time zeroed to ``T00:00``).
    """
    further_performances = self._getFurtherPerformances()
    if not further_performances:
        return

    for precise_date, detail_url in further_performances:
        # iso_8601_date = Lepistant.formatDateToISO8601(date)
        # Performances are keyed by day only, so the time part is zeroed
        # for the lookup.
        date_pieces = precise_date.split('T')
        lookup_date = date_pieces[0] + 'T00:00'
        lookup_time = date_pieces[1]
        if lookup_time != '00:00':
            self._setDefaultTime(lookup_time)

        file_path = Lepistant.createFilePath(
            self.file_path_on_disk, precise_date, 'performance')
        soup = Lepistant.getSoup(detail_url, file_path)

        if lookup_date not in self.performances:
            continue

        # Replace the day-only date with the more precise one (including
        # weekday and time).
        performance = self.performances[lookup_date]
        # TODO: Updating of the date should be done in a single function.
        performance.date = str()
        performance['date'] = precise_date
        performance.setDetails(soup, self.title)
def createJSONFile(file_name, leporello, prettify):
    """Serialize ``leporello`` to a JSON file in the downloads folder.

    The target path is built by ``Lepistant.createFilePath`` from
    ``Lepistant.REL_PATH_DOWNLOADS_FOLDER``, ``file_name`` and ``'json'``.
    When ``prettify`` is truthy a second file, named ``file_name`` plus
    ``'_formatted'``, is written with an indentation of 4.

    Args:
        file_name: Base name handed to ``Lepistant.createFilePath``.
        leporello: Object passed to ``json.dump``.
        prettify: If truthy, also write the indented variant.
    """
    file_path = Lepistant.createFilePath(
        Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name, 'json')
    # The context manager closes the handle, so the output is flushed to
    # disk even when json.dump raises half-way through.
    with open(file_path, 'w+') as json_file:
        json.dump(leporello, json_file, sort_keys=True, ensure_ascii=False)

    if prettify:
        file_path_formatted = Lepistant.createFilePath(
            Lepistant.REL_PATH_DOWNLOADS_FOLDER,
            file_name + '_formatted',
            'json')
        with open(file_path_formatted, 'w+') as json_file_formatted:
            json.dump(leporello, json_file_formatted, sort_keys=True,
                      ensure_ascii=False, indent=4)
def _setDates(self):
    """Parse the German date listing of the play item into date strings.

    The text is read from the sibling following the ``span`` of the second
    ``p`` tag in ``self.play_item_soup``. Only the part after the first
    ``':'`` is parsed. Example input:

        "Neues Schauspielhaus / Termine: 10. | 14. | 18. September 2011,
         01. | 16. | 22. Oktober 2011, 27. Januar 2012"

    Months are separated by ``','`` and days within a month by ``'|'``.
    Only the last day of each month carries the month name and year, so
    both are taken from it and appended to the preceding days.

    ``LC_TIME`` is switched to ``de_DE.utf-8`` for parsing the month names
    and is always switched back to ``en_US.utf-8`` afterwards.

    Returns:
        Whatever ``self._setKey('dates', dates)`` returns. ``dates`` holds
        the strings produced by ``Lepistant.formatDatetimeToString`` for
        every date parsed before a failure (if any) occurred.
    """
    dates = list()
    # Text most recently handed to strptime; kept so that a ValueError is
    # always reported against the date that actually failed (and never
    # against an unbound name).
    date_being_parsed = None
    try:
        # Set locale time from en_US to de_DE for formatting calendar dates.
        locale.setlocale(locale.LC_TIME, 'de_DE.utf-8')
        dates_string = self.play_item_soup.findAll('p')[1].span.nextSibling
        dates_part = dates_string.partition(':')[2]

        for dates_of_one_month in dates_part.split(','):
            # dates_of_one_month = " 10. | 14. | 18. September 2011"
            raw_dates = dates_of_one_month.split('|')
            # raw_dates[-1] = " 18. September 2011"
            month_and_year = raw_dates[-1].split()
            month = month_and_year[1]
            year = month_and_year[2]

            for raw_day in raw_dates[:-1]:
                # raw_day = " 10. " -> "10. September 2011"
                date_being_parsed = raw_day.lstrip() + month + ' ' + year
                date_time = datetime.datetime.strptime(
                    date_being_parsed, '%d. %B %Y')
                # For json serialization the date is saved as a string.
                dates.append(Lepistant.formatDatetimeToString(date_time))

            # The last entry already contains month and year.
            date_being_parsed = raw_dates[-1].strip()
            last_date_time = datetime.datetime.strptime(
                date_being_parsed, '%d. %B %Y')
            dates.append(Lepistant.formatDatetimeToString(last_date_time))

        logger.info('%s - set dates: %s', self.title, dates)
    except ValueError as verr:
        logger.error('Failed to format date "%s" for play "%s" due to: %s',
                     date_being_parsed, self.title, str(verr))
    except Exception:
        logger.error('')
        logger.error('Failed to set dates for play "%s". Therefore setting dates to an empty list.', self.title)
        logger.error('')
    finally:
        # Set locale time back from de_DE to en_US, whatever happened above.
        locale.setlocale(locale.LC_TIME, 'en_US.utf-8')

    logger.info('')
    return self._setKey('dates', dates)
def _setData(self, data_list):
    """Populate this artist from the tags in ``data_list``.

    Each element is classified by the CSS class found in its string form:

    * ``eventDetailPerson``     -> plain name
    * ``eventDetailPersonRole`` -> role (text before the first ``':'``)
    * ``eventDetailPersonLink`` -> name plus a link to a detail page

    If ``leporello.artists`` already knows the name, the stored values are
    copied onto ``self``. Otherwise the name is set and the detail page is
    loaded and parsed. Finally the role (if any) is added and ``self`` is
    registered in ``leporello.artists``.

    Args:
        data_list: Iterable of parsed tags describing one person.

    Returns:
        ``data_list`` unchanged.
    """
    role = None
    full_name = ''
    url = Lepistant.NOT_AVAILABLE

    for element in data_list:
        markup = str(element)
        if 'class="eventDetailPerson"' in markup:
            full_name = element.string.strip()
        elif 'class="eventDetailPersonRole"' in markup:
            try:
                role = element.string.split(':')[0].strip()
            except AttributeError:
                # Raised when element.string is None (presumably a tag
                # without a single text child - see BeautifulSoup docs).
                logger.info(
                    'Setting role to "%s" since no role could be find in data_list: %s',
                    role, data_list)
        elif 'class="eventDetailPersonLink"' in markup:
            full_name = element.string.strip()
            href_match = re.search('href=\"(.+?)\"', markup)
            if href_match:
                url = Lepistant.URL_PREFIX + href_match.group(1)
            else:
                # Keep the default url instead of crashing on a link tag
                # without an href attribute.
                logger.warning(
                    'No href found for artist "%s" in element: %s',
                    full_name, markup)

    # Check if artist already exists.
    if full_name in leporello.artists:
        artist = leporello.artists[full_name]
        self.full_name = self._setKey('full_name', artist.full_name)
        self.first_name = self._setKey('first_name', artist.first_name)
        self.middle_name = self._setKey('middle_name', artist.middle_name)
        self.last_name = self._setKey('last_name', artist.last_name)
        self.producer_roles = self._setKey('producer_roles', artist.producer_roles)
        self.artist_roles = self._setKey('artist_roles', artist.artist_roles)
        self.photo = self._setKey('photo', artist.photo)
        self.biography = self._setKey('biography', artist.biography)
        self.appearances = self._setKey('appearances', artist.appearances)
    else:
        self._setName(full_name)
        # NOTE(review): url defaults to Lepistant.NOT_AVAILABLE; if that
        # constant is truthy this branch also runs without a link - confirm.
        if url:
            file_path = Lepistant.createFilePath(
                Lepistant.REL_PATH_ARTISTS_FOLDER, full_name, 'html')
            soup = Lepistant.getSoup(url, file_path)
            self._setDetails(soup)

    if role:
        self._addRole(role)

    # Add artist to the leporello artists dictionary so we can check later
    # if the artist already exists. If the artist exists we only update
    # his data.
    leporello.artists[self.full_name] = self
    return data_list
def createJSONFile(file_name, leporello, prettify):
    """Write ``leporello`` as JSON into the downloads folder.

    The compact file is always written; its path comes from
    ``Lepistant.createFilePath`` with ``Lepistant.REL_PATH_DOWNLOADS_FOLDER``,
    ``file_name`` and ``'json'``. If ``prettify`` is truthy an additional
    file named ``file_name + '_formatted'`` is written with ``indent=4``.

    Args:
        file_name: Base name handed to ``Lepistant.createFilePath``.
        leporello: Object passed to ``json.dump``.
        prettify: If truthy, also write the indented variant.
    """
    file_path = Lepistant.createFilePath(
        Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name, 'json')
    # Using ``with`` guarantees the handle is closed (and the data flushed)
    # instead of leaving that to garbage collection.
    with open(file_path, 'w+') as json_file:
        json.dump(leporello, json_file, sort_keys=True, ensure_ascii=False)

    if prettify:
        file_path_formatted = Lepistant.createFilePath(
            Lepistant.REL_PATH_DOWNLOADS_FOLDER,
            file_name + '_formatted',
            'json')
        with open(file_path_formatted, 'w+') as json_file_formatted:
            json.dump(leporello, json_file_formatted, sort_keys=True,
                      ensure_ascii=False, indent=4)
def _setData(self, data_list):
    """Fill in this artist's data from the tags in ``data_list``.

    The string form of each element decides how it is used:

    * contains ``class="eventDetailPerson"``     -> name
    * contains ``class="eventDetailPersonRole"`` -> role, i.e. the text
      before the first ``':'``
    * contains ``class="eventDetailPersonLink"`` -> name and detail-page url

    A name already present in ``leporello.artists`` has its stored values
    copied onto ``self``; otherwise the name is set and the detail page is
    fetched and parsed. The role (if any) is then added and ``self`` is
    stored in ``leporello.artists`` under ``self.full_name``.

    Args:
        data_list: Iterable of parsed tags describing one person.

    Returns:
        ``data_list`` unchanged.
    """
    role = None
    full_name = ''
    url = Lepistant.NOT_AVAILABLE

    for element in data_list:
        # Render the element once; every branch tests the same text.
        markup = str(element)
        if 'class="eventDetailPerson"' in markup:
            full_name = element.string.strip()
        elif 'class="eventDetailPersonRole"' in markup:
            try:
                role = element.string.split(':')[0].strip()
            except AttributeError:
                # element.string was None (or missing), so there is no
                # role text to split; role keeps its previous value.
                logger.info('Setting role to "%s" since no role could be find in data_list: %s', role, data_list)
        elif 'class="eventDetailPersonLink"' in markup:
            full_name = element.string.strip()
            href_match = re.search('href=\"(.+?)\"', markup)
            if href_match:
                url = Lepistant.URL_PREFIX + href_match.group(1)
            else:
                # A link tag without href no longer aborts the whole run;
                # url simply keeps its default.
                logger.warning('No href found for artist "%s" in element: %s', full_name, markup)

    # Check if artist already exists.
    if full_name in leporello.artists:
        artist = leporello.artists[full_name]
        self.full_name = self._setKey('full_name', artist.full_name)
        self.first_name = self._setKey('first_name', artist.first_name)
        self.middle_name = self._setKey('middle_name', artist.middle_name)
        self.last_name = self._setKey('last_name', artist.last_name)
        self.producer_roles = self._setKey('producer_roles', artist.producer_roles)
        self.artist_roles = self._setKey('artist_roles', artist.artist_roles)
        self.photo = self._setKey('photo', artist.photo)
        self.biography = self._setKey('biography', artist.biography)
        self.appearances = self._setKey('appearances', artist.appearances)
    else:
        self._setName(full_name)
        # NOTE(review): url starts as Lepistant.NOT_AVAILABLE; whether that
        # value is falsy cannot be seen from here - confirm.
        if url:
            file_path = Lepistant.createFilePath(Lepistant.REL_PATH_ARTISTS_FOLDER, full_name, 'html')
            soup = Lepistant.getSoup(url, file_path)
            self._setDetails(soup)

    if role:
        self._addRole(role)

    # Add artist to the leporello artists dictionary so we can check later
    # if the artist already exists. If the artist exists we only update
    # his data.
    leporello.artists[self.full_name] = self
    return data_list
def _setPhoto(self, soup):
    """Store the artist's photo URL, or the NOT_AVAILABLE marker.

    Looks up the first ``img`` tag with class ``person-picture`` in ``soup``
    and passes it to ``Lepistant.getURLFromImageTag``. A falsy result is
    replaced by ``Lepistant.NOT_AVAILABLE``. The value is stored through
    ``self._setKey('photo', ...)`` and logged.
    """
    picture_tag = soup.find('img', {"class": "person-picture"})
    photo = Lepistant.getURLFromImageTag(picture_tag) or Lepistant.NOT_AVAILABLE
    self.photo = self._setKey('photo', photo)
    logger.info('%s - set photo: "%s".', self.full_name, self.photo)
def _setFurtherInfo(self):
    """Set ``further_info`` from the 'Weitere Texte' section of the play.

    The paragraphs come from ``self._getParagraphsForContent`` and are
    joined by ``Lepistant.formatParagraphsToString``. This is best effort:
    on any error a warning is logged and ``Lepistant.NOT_AVAILABLE`` is
    stored instead. The value is stored through
    ``self._setKey('further_info', ...)``.
    """
    further_info = Lepistant.NOT_AVAILABLE
    try:
        further_info_paragraphs = self._getParagraphsForContent('Weitere Texte')
        further_info = Lepistant.formatParagraphsToString(further_info_paragraphs)
        logger.info('%s - set further_info: "%s..."', self.title,
                    repr(further_info[:Lepistant.LOG_MESSAGE_LENGTH]))
    except Exception:
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate. Reset the value so it matches what
        # the warning below announces.
        further_info = Lepistant.NOT_AVAILABLE
        logger.warning('Failed to set further_info for play "%s". Therefore setting further_info to %s.',
                       self.title, Lepistant.NOT_AVAILABLE)
    self.further_info = self._setKey('further_info', further_info)
def _setBiography(self, soup):
    """Store the artist's biography, or the NOT_AVAILABLE marker.

    Collects all ``p`` tags with class ``person-description`` from ``soup``
    and turns them into one string via
    ``Lepistant.formatParagraphsToString``. A falsy result is replaced by
    ``Lepistant.NOT_AVAILABLE``. The value is stored through
    ``self._setKey('biography', ...)`` and its beginning is logged.
    """
    description_tags = soup.findAll('p', {"class": "person-description"})
    biography = (Lepistant.formatParagraphsToString(description_tags)
                 or Lepistant.NOT_AVAILABLE)
    self.biography = self._setKey('biography', biography)
    logger.info('%s - set biography: "%s...".', self.full_name,
                repr(self.biography[:Lepistant.LOG_MESSAGE_LENGTH]))
def _setPhoto(self, soup):
    """Set ``self.photo`` from the ``person-picture`` image in ``soup``.

    The first matching ``img`` tag is handed to
    ``Lepistant.getURLFromImageTag``; when that yields nothing truthy,
    ``Lepistant.NOT_AVAILABLE`` is used. The result goes through
    ``self._setKey('photo', ...)`` and is logged.
    """
    picture_tag = soup.find('img', {"class": "person-picture"})
    picture_url = Lepistant.getURLFromImageTag(picture_tag)
    photo = picture_url if picture_url else Lepistant.NOT_AVAILABLE
    self.photo = self._setKey('photo', photo)
    logger.info('%s - set photo: "%s".', self.full_name, self.photo)
def _setSummary(self):
    """Set ``summary`` from the 'Inhalt' section of the play.

    The paragraphs come from ``self._getParagraphsForContent`` and are
    joined by ``Lepistant.formatParagraphsToString``. This is best effort:
    on any error a warning is logged and ``Lepistant.NOT_AVAILABLE`` is
    stored instead. The value is stored through
    ``self._setKey('summary', ...)``.
    """
    summary = Lepistant.NOT_AVAILABLE
    try:
        summary_paragraphs = self._getParagraphsForContent('Inhalt')
        summary = Lepistant.formatParagraphsToString(summary_paragraphs)
        logger.info('%s - set summary: "%s..."', self.title,
                    repr(summary[:Lepistant.LOG_MESSAGE_LENGTH]))
    except Exception:
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate. Reset the value so it matches what
        # the warning below announces.
        summary = Lepistant.NOT_AVAILABLE
        logger.warning('Failed to set summary for play "%s". Therefore setting summary to %s.',
                       self.title, Lepistant.NOT_AVAILABLE)
    self.summary = self._setKey('summary', summary)
def _setCritics(self):
    """Set ``critics`` from the 'Pressestimmen' section of the play.

    The paragraphs come from ``self._getParagraphsForContent`` and are
    joined by ``Lepistant.formatParagraphsToString``. This is best effort:
    on any error a warning is logged and ``Lepistant.NOT_AVAILABLE`` is
    stored instead. The value is stored through
    ``self._setKey('critics', ...)``.
    """
    critics = Lepistant.NOT_AVAILABLE
    try:
        critics_paragraphs = self._getParagraphsForContent('Pressestimmen')
        critics = Lepistant.formatParagraphsToString(critics_paragraphs)
        logger.info('%s - set critics: "%s..."', self.title,
                    repr(critics[:Lepistant.LOG_MESSAGE_LENGTH]))
    except Exception:
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate. Reset the value so it matches what
        # the warning below announces.
        critics = Lepistant.NOT_AVAILABLE
        logger.warning('Failed to set critics for play "%s". Therefore setting critics to %s.',
                       self.title, Lepistant.NOT_AVAILABLE)
    self.critics = self._setKey('critics', critics)
def _setBiography(self, soup):
    """Set ``self.biography`` from the ``person-description`` paragraphs.

    All matching ``p`` tags in ``soup`` are converted to a single string by
    ``Lepistant.formatParagraphsToString``. If that string is falsy,
    ``Lepistant.NOT_AVAILABLE`` is stored instead. The result goes through
    ``self._setKey('biography', ...)`` and its beginning is logged.
    """
    description_tags = soup.findAll('p', {"class": "person-description"})
    biography_text = Lepistant.formatParagraphsToString(description_tags)
    if not biography_text:
        biography_text = Lepistant.NOT_AVAILABLE
    self.biography = self._setKey('biography', biography_text)
    logger.info('%s - set biography: "%s...".', self.full_name,
                repr(self.biography[:Lepistant.LOG_MESSAGE_LENGTH]))
def _setPhotos(self):
    """Collect the URLs of all thumbnail images of the play.

    Searches ``self.play_detail_soup`` for the ``div`` with class
    ``thumbnails`` and converts each contained ``img`` tag with
    ``Lepistant.getURLFromImageTag``. An ``AttributeError`` (e.g. no such
    ``div``) is logged as a warning; URLs gathered up to that point are
    kept. The list is stored through ``self._setKey('photos', ...)``.
    """
    photo_urls = []
    try:
        thumbnails = self.play_detail_soup.find('div', {"class": "thumbnails"})
        for thumbnail in thumbnails.findAll('img'):
            photo_urls.append(Lepistant.getURLFromImageTag(thumbnail))
        logger.info('%s - set photos: %s', self.title, photo_urls)
    except AttributeError as attrerr:
        logger.warning('Failed to set photos for play "%s" due to: %s. Therefore setting photos to an empty list.',
                       self.title, str(attrerr))
    self.photos = self._setKey('photos', photo_urls)
def getPlays(leporello_info):
    """Scrape one hard-coded play from the leporello overview page.

    The overview page is loaded via ``Lepistant.getSoup`` and its play items
    are stored in ``leporello_info[PLAY_ITEMS]``. For testing purposes only
    a single play item (index 12) is turned into a ``Play``; a loop over all
    items is not active in this function.

    Args:
        leporello_info: Mapping read with the keys ``URL_LEPORELLO`` and
            ``CSS_CLASS_PLAY_ITEM``; ``PLAY_ITEMS`` is written to it.

    Returns:
        A list containing the one scraped ``Play``.
    """
    # Initializing the Leporello Assistent with meta information for web
    # scraping is currently disabled here:
    # Lepistant.setInfo(leporello_info)
    overview_path = Lepistant.createFilePath(
        Lepistant.REL_PATH_DOWNLOADS_FOLDER,
        Lepistant.FILE_NAME_LEPORELLO,
        'leporello')
    overview_soup = Lepistant.getSoup(
        leporello_info[URL_LEPORELLO], overview_path)
    play_items = Lepistant.getTagsByClass(
        overview_soup, 'div', leporello_info[CSS_CLASS_PLAY_ITEM])
    leporello_info[PLAY_ITEMS] = play_items

    # Get only one play item for testing purposes.
    # Index 20 would be "WirAlleAnders"; index 12 is "AltArmArbeitslos".
    chosen_item = play_items[12]

    play = Play(chosen_item)
    play.link = Lepistant.getURLFromTagContent(chosen_item)
    formatted_title = Lepistant.removeNonAlphanumericCharacters(play.title)
    play.file_path_on_disk = Lepistant.REL_PATH_PLAYS_FOLDER + formatted_title + '/'
    play.file_name_on_disk = Lepistant.createFilePath(
        play.file_path_on_disk, play.title, 'html')
    detail_soup = Lepistant.getSoup(play.link, play.file_name_on_disk)
    play.setPlayDetails(detail_soup)

    return [play]
def _getFurtherPerformances(self):
    """Return ``(date, url)`` tuples for the play's further performances.

    Reads every ``div`` inside the ``div`` with class
    ``further-performances`` of ``self.play_detail_soup``. The link text is
    parsed with the format ``'%a, %d.%m.%Y / %H.%M Uhr'`` and converted by
    ``Lepistant.formatDatetimeToString``; the url comes from
    ``Lepistant.getURLFromLinkTag``.

    ``LC_TIME`` is switched to ``de_DE.utf-8`` for parsing and is always
    switched back to ``en_US.utf-8`` afterwards.

    Returns:
        List of ``(date, url)`` tuples. On an error a warning is logged and
        the tuples collected so far (possibly none) are returned.
    """
    # Set locale time from en_US to de_DE for formatting calendar dates.
    locale.setlocale(locale.LC_TIME, 'de_DE.utf-8')
    perfomance_tuples = list()
    try:
        container = self.play_detail_soup.find(
            'div', {"class": "further-performances"})
        for perfomance_tag in container.findAll('div'):
            date_link_tag = perfomance_tag.findAll('a', text=True)[0].parent
            date_time = datetime.datetime.strptime(
                date_link_tag.string, '%a, %d.%m.%Y / %H.%M Uhr')
            date = Lepistant.formatDatetimeToString(date_time)
            url = Lepistant.getURLFromLinkTag(date_link_tag)
            perfomance_tuples.append((date, url))
    except Exception:
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed by this best-effort lookup.
        logger.warning('Failed to get further performances for play "%s". Therefore returning an empty list.', self.title)
    finally:
        # Set locale time back from de_DE to en_US, whatever happened above.
        locale.setlocale(locale.LC_TIME, 'en_US.utf-8')
    return perfomance_tuples
def _setSponsors(self):
    """Collect the sponsor image URLs of the play.

    Looks up the ``div`` with class ``sponsors clearfix`` in
    ``self.play_detail_soup``. If it is truthy, every item obtained by
    iterating over it is passed to ``Lepistant.getURLFromImageTag``;
    otherwise an info message is logged. An ``AttributeError`` is logged as
    a warning. The list is stored through ``self._setKey('sponsors', ...)``.
    """
    sponsor_urls = []
    try:
        sponsor_container = self.play_detail_soup.find(
            'div', {"class": "sponsors clearfix"})
        if not sponsor_container:
            logger.info('%s - Play does not have any sponsors. Therefore setting sponsors to an empty list.', self.title)
        else:
            # NOTE(review): this iterates the container itself, whereas
            # _setPhotos iterates findAll('img') - confirm this is intended.
            for sponsor_item in sponsor_container:
                sponsor_urls.append(Lepistant.getURLFromImageTag(sponsor_item))
            logger.info('%s - set sponsors: %s', self.title, sponsor_urls)
    except AttributeError as attrerr:
        logger.warning('Failed to set sponsor logos for play "%s" due to: %s. Therefore setting sponsors to an empty list.',
                       self.title, str(attrerr))
    self.sponsors = self._setKey('sponsors', sponsor_urls)
def getPlays(leporello_info):
    """Fetch the leporello overview and scrape a single, fixed play.

    All play items found on the overview page are written to
    ``leporello_info[PLAY_ITEMS]``, but only the item at index 12 is
    converted into a ``Play`` (testing shortcut); iterating over every item
    is not active in this function.

    Args:
        leporello_info: Mapping read with the keys ``URL_LEPORELLO`` and
            ``CSS_CLASS_PLAY_ITEM``; ``PLAY_ITEMS`` is written to it.

    Returns:
        A one-element list with the scraped ``Play``.
    """
    # Initializing the Leporello Assistent with meta information for web
    # scraping is currently disabled here:
    # Lepistant.setInfo(leporello_info)
    overview_file = Lepistant.createFilePath(
        Lepistant.REL_PATH_DOWNLOADS_FOLDER,
        Lepistant.FILE_NAME_LEPORELLO,
        'leporello')
    overview_soup = Lepistant.getSoup(
        leporello_info[URL_LEPORELLO], overview_file)
    all_items = Lepistant.getTagsByClass(
        overview_soup, 'div', leporello_info[CSS_CLASS_PLAY_ITEM])
    leporello_info[PLAY_ITEMS] = all_items

    # Get only one play item for testing purposes.
    # Index 20 would be "WirAlleAnders"; index 12 is "AltArmArbeitslos".
    single_item = all_items[12]

    play = Play(single_item)
    play.link = Lepistant.getURLFromTagContent(single_item)
    title_for_path = Lepistant.removeNonAlphanumericCharacters(play.title)
    play.file_path_on_disk = Lepistant.REL_PATH_PLAYS_FOLDER + title_for_path + '/'
    play.file_name_on_disk = Lepistant.createFilePath(
        play.file_path_on_disk, play.title, 'html')
    play.setPlayDetails(Lepistant.getSoup(play.link, play.file_name_on_disk))

    plays = [play]
    return plays
def getSpecificTheaterPlays(leporello_info, file_name, play_type, url):
    """Scrape every play listed on one theater overview page.

    The overview at ``url`` is loaded through ``Lepistant.getSoup`` (using a
    file path built from ``file_name``). Each play item found there becomes
    a ``Play`` of type ``play_type`` whose detail page is loaded and parsed;
    afterwards ``setDefaultTimeForPerformancesInPlay`` is applied to it.

    Args:
        leporello_info: Mapping read with the key ``CSS_CLASS_PLAY_ITEM``.
        file_name: Base name for the overview's file path.
        play_type: Value passed to ``Play.setType``.
        url: Address of the overview page.

    Returns:
        List of the scraped ``Play`` objects in page order.
    """
    overview_path = Lepistant.createFilePath(
        Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name, 'html')
    overview_soup = Lepistant.getSoup(url, overview_path)
    items = Lepistant.getTagsByClass(
        overview_soup, 'div', leporello_info[CSS_CLASS_PLAY_ITEM])

    theater_plays = []
    for item in items:
        logger.info('>>>>>>>>>>>>>>>>>>>>>>>> Fetching new play <<<<<<<<<<<<<<<<<<<<<<<<<')

        play = Play(item)
        play.setType(play_type)
        play.link = Lepistant.getURLFromTagContent(item)

        title_for_path = Lepistant.removeNonAlphanumericCharacters(play.title)
        play.file_path_on_disk = Lepistant.REL_PATH_PLAYS_FOLDER + title_for_path + '/'
        play.file_name_on_disk = Lepistant.createFilePath(
            play.file_path_on_disk, play.title, 'html')

        detail_soup = Lepistant.getSoup(play.link, play.file_name_on_disk)
        play.setPlayDetails(detail_soup)
        setDefaultTimeForPerformancesInPlay(play)

        logger.info('')
        theater_plays.append(play)

    return theater_plays
def getSpecificTheaterPlays(leporello_info, file_name, play_type, url):
    """Build a ``Play`` for each play item on the overview page at ``url``.

    The overview is loaded with ``Lepistant.getSoup``, using a path derived
    from ``file_name``. For every play item a ``Play`` is created, typed
    with ``play_type``, linked, given its on-disk paths, filled from its
    detail page and finally passed to
    ``setDefaultTimeForPerformancesInPlay``.

    Args:
        leporello_info: Mapping read with the key ``CSS_CLASS_PLAY_ITEM``.
        file_name: Base name for the overview's file path.
        play_type: Value passed to ``Play.setType``.
        url: Address of the overview page.

    Returns:
        List of the scraped ``Play`` objects in page order.
    """
    overview_file = Lepistant.createFilePath(
        Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name, 'html')
    overview_soup = Lepistant.getSoup(url, overview_file)
    play_items = Lepistant.getTagsByClass(
        overview_soup, 'div', leporello_info[CSS_CLASS_PLAY_ITEM])

    collected = []
    for current_item in play_items:
        logger.info('>>>>>>>>>>>>>>>>>>>>>>>> Fetching new play <<<<<<<<<<<<<<<<<<<<<<<<<')
        play = Play(current_item)
        play.setType(play_type)
        play.link = Lepistant.getURLFromTagContent(current_item)
        cleaned_title = Lepistant.removeNonAlphanumericCharacters(play.title)
        play.file_path_on_disk = Lepistant.REL_PATH_PLAYS_FOLDER + cleaned_title + '/'
        play.file_name_on_disk = Lepistant.createFilePath(
            play.file_path_on_disk, play.title, 'html')
        play.setPlayDetails(
            Lepistant.getSoup(play.link, play.file_name_on_disk))
        setDefaultTimeForPerformancesInPlay(play)
        logger.info('')
        collected.append(play)
    return collected
if (prettify): file_path_formatted = Lepistant.createFilePath( Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name + '_formatted', 'json') json_file_formatted = open(file_path_formatted, 'w+') json.dump(leporello, json_file_formatted, sort_keys=True, ensure_ascii=False, indent=4) # Starting our leporello scraper # Initializing the Leporello Assistent with meta information for web scraping. Lepistant.setInfo(leporello_info) # leporello['plays'] = getPlays(leporello_info) plays = list() straight_theater_plays = getSpecificTheaterPlays( leporello_info, 'SchauspielBremen', 'straight', leporello_info[URL_STRAIGHT_THEATER]) # leporello['straight_theater'] = straight_theater_plays # plays.append(straight_theater_plays) plays.extend(straight_theater_plays) musical_theater_plays = getSpecificTheaterPlays( leporello_info, 'OperBremen', 'musical', leporello_info[URL_MUSICAL_THEATER])
file_path = Lepistant.createFilePath(Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name, 'json') json_file = open(file_path, 'w+') json.dump(leporello, json_file, sort_keys=True, ensure_ascii=False) if (prettify): file_path_formatted = Lepistant.createFilePath(Lepistant.REL_PATH_DOWNLOADS_FOLDER, file_name + '_formatted', 'json') json_file_formatted = open(file_path_formatted, 'w+') json.dump(leporello, json_file_formatted, sort_keys=True, ensure_ascii=False, indent=4) # Starting our leporello scraper # Initializing the Leporello Assistent with meta information for web scraping. Lepistant.setInfo(leporello_info) # leporello['plays'] = getPlays(leporello_info) plays = list() straight_theater_plays = getSpecificTheaterPlays(leporello_info, 'SchauspielBremen', 'straight', leporello_info[URL_STRAIGHT_THEATER]) # leporello['straight_theater'] = straight_theater_plays # plays.append(straight_theater_plays) plays.extend(straight_theater_plays) musical_theater_plays = getSpecificTheaterPlays(leporello_info, 'OperBremen', 'musical', leporello_info[URL_MUSICAL_THEATER]) # leporello['musical_theater'] = musical_theater_plays plays.extend(musical_theater_plays) dance_theater_plays = getSpecificTheaterPlays(leporello_info, 'TanztheaterBremen', 'dance', leporello_info[URL_DANCE_THEATER])