def get_last_log_info(parser_name):
    """Return a summary of the most recent ``log`` row for ``parser_name``.

    Rows of the ``log`` table whose ``parser_name`` column equals the
    argument are ordered by ``cr_tm`` descending and the first one is read
    through the module-level ``db`` object.

    Args:
        parser_name: Value compared against the ``parser_name`` column.

    Returns:
        A dict with the keys ``log_name``, ``new_items``, ``total_channels``,
        ``total_programs``, ``log_cr_tm``, ``log_status`` and
        ``execution_time``.  When no row is found, the text fields hold
        ``"Parser not started yet"`` and the numeric fields hold ``0``.
        Otherwise ``execution_time`` is the stored value passed through
        ``hours_minutes_seconds_from_seconds``.
    """
    # The value is sent as a bound parameter instead of being interpolated
    # into the SQL text, so quotes inside ``parser_name`` cannot alter the
    # statement.
    # NOTE(review): assumes ``db.execute`` follows the DB-API cursor contract
    # with the ``pyformat`` paramstyle (psycopg2 / MySQLdb) - confirm.
    db.execute(
        """
        SELECT * FROM log WHERE parser_name=%(parser_name)s
        ORDER BY cr_tm DESC;
        """,
        {"parser_name": parser_name},
    )
    result = db.fetchone()

    if not result:
        not_started = "Parser not started yet"
        return dict(
            log_name=not_started,
            new_items=0,
            total_channels=0,
            total_programs=0,
            log_cr_tm=not_started,
            log_status=not_started,
            execution_time=0,
        )

    return dict(
        log_name=result.get("parser_name"),
        new_items=result.get("count_new_items"),
        total_channels=result.get("total_channels"),
        total_programs=result.get("total_programs"),
        log_cr_tm=result.get("cr_tm"),
        log_status=result.get("success"),
        execution_time=hours_minutes_seconds_from_seconds(
            result.get("execution_time")),
    )
def parse_url_channels(self):
    """Scrape the channel list page and store the channels that were found.

    Starts the web driver, scrolls the page until
    ``window.innerHeight + window.scrollY`` stops changing, then reads the
    name, link and icon of every element matched by
    ``self.get_channel_css_selector()``.  The collected mapping is handed to
    ``SaveRecordsToDb().save_channels_to_db``; its return value is reported
    as the number of new channels by e-mail, in the log and through
    ``SaveRecordsToDb.insert_log_info``.

    The driver is closed even when parsing fails part-way; the exception
    itself still propagates to the caller.
    """
    # run_phantomjs()
    # NOTE(review): presumably this pause waited for the browser launched by
    # the disabled call above - confirm whether it is still needed.
    time.sleep(20)
    self.driver_start()
    try:
        write_to_log('Start channels parsing')
        func_tm = time.time()
        page_height = 0
        elements = {}
        scroll_height_script = "return window.innerHeight + window.scrollY"

        # Keep scrolling to the bottom until the scroll position no longer
        # moves between two passes.
        while page_height != self.driver.execute_script(scroll_height_script):
            page_height = self.driver.execute_script(scroll_height_script)
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

        channels = self.driver.find_elements_by_css_selector(
            self.get_channel_css_selector())
        write_to_log('Found %s channels' % len(channels))

        for channel in channels:
            time.sleep(1)
            raw_href = channel.get_attribute('href')
            if raw_href is None:
                # Check before encoding: calling .encode() on None would
                # raise instead of skipping the element.
                continue
            href = raw_href.encode('utf-8')
            if href in elements:
                # The first occurrence of a link wins.
                continue
            name = channel.find_element_by_css_selector(
                'span.tv-channel-title__text').text.encode('utf-8')
            icon = channel.find_elements_by_css_selector(
                'div.tv-channel-title__icon > '
                'span[class$="image_type_channel"] > span')
            if icon:
                icon = self.get_background_image(icon[0]).encode('utf-8')
            # When no icon element exists, ``icon`` stays an empty list.
            elements[href] = {'name': name, 'icon': icon}

        save_records = SaveRecordsToDb()
        new_elements_count = save_records.save_channels_to_db(elements)
        func_tm = int(time.time() - func_tm)
        text_for_log = (
            'Channels parsed successfully.{elements_count} new channels.'
            'Execution time: {func_tm}'
        ).format(elements_count=new_elements_count,
                 func_tm=hours_minutes_seconds_from_seconds(func_tm))
        send_email(subject='Parser notification', text=text_for_log)
        write_to_log(text_for_log)
        SaveRecordsToDb.insert_log_info(execution_time=func_tm,
                                        new_items=new_elements_count)
    finally:
        # Release the browser window whether or not parsing succeeded.
        self.driver.close()
def parse_tv_programs(self):
    """Scrape the programme schedule of every stored channel.

    For each record returned by ``GetRecordsFromDb().get_channels_id_and_link()``
    the channel page is opened.  When the channel has no description or web
    site yet, both are read from the page (or replaced by a placeholder
    text), cut to ``Channel.description['length']`` /
    ``Channel.web_site['length']`` and stored via ``channel.update()``.
    Then up to seven dates that are not earlier than today are taken from
    the day filter, and the schedule of each date is parsed into
    ``TvProgram`` objects passed to ``SaveRecordsToDb.save_programs``.

    Records without a link are logged; pages whose title contains ``404``
    are logged and reported by e-mail.  At the end the total number of
    parsed programmes and the execution time (whole seconds) are e-mailed,
    logged and stored through ``SaveRecordsToDb.insert_log_info``.

    The driver is closed even when parsing fails part-way; the exception
    itself still propagates to the caller.
    """
    # run_phantomjs()
    # NOTE(review): presumably this pause waited for the browser launched by
    # the disabled call above - confirm whether it is still needed.
    time.sleep(20)
    self.driver_start()
    try:
        write_to_log('Start programs parsing')
        func_tm = time.time()
        ids_and_links = GetRecordsFromDb().get_channels_id_and_link()
        # Parsed once up front; it is compared against every day element.
        today = datetime.datetime.strptime(
            get_date_and_time_with_timezone(), '%Y-%m-%d')
        count_programs = 0

        for id_and_link in ids_and_links:
            channel = Channel(channel_id=id_and_link['id'])
            channel.update()

            link = id_and_link.get('link')
            if not link:
                write_to_log('Wrong channel link %s. Channel id %s' %
                             (link, id_and_link.get('id')))
                continue

            self.driver.get(link)
            time.sleep(4)
            if '404' in self.driver.title:
                write_to_log('Error. Page {page} not found'.format(
                    page=self.driver.current_url))
                send_email(subject='Page not found',
                           text='Page {page} not found'.format(
                               page=self.driver.current_url))
                continue

            if not channel.description or not channel.web_site:
                description_tags = self.driver.find_elements_by_css_selector(
                    "tr.b-row div.b-tv-channel-content__text")
                if description_tags:
                    channel_description = \
                        description_tags[0].text.encode('utf-8')
                else:
                    channel_description = \
                        "This channel does not have description"

                web_site_tags = self.driver.find_elements_by_css_selector(
                    "div.b-tv-channel-content__channel-info > "
                    "div.b-tv-channel-content__links > a")
                if web_site_tags:
                    channel_web_site = web_site_tags[0].get_attribute(
                        'href').encode('utf-8')
                else:
                    channel_web_site = "This channel does not have web site"

                # Slicing leaves values that already fit unchanged.
                channel_description = \
                    channel_description[:Channel.description['length']]
                channel_web_site = \
                    channel_web_site[:Channel.web_site['length']]
                channel.description, channel.web_site = \
                    channel_description, channel_web_site
                channel.update()

            # Collect the dates offered by the day filter, from today on.
            dates_of_week = []
            for day_tag in self.driver.find_elements_by_css_selector(
                    'div.tv-filter-days__viewport > '
                    'div.tv-filter-days__items > '
                    'div.tv-filter-days__item'):
                date_of_week = re.findall(
                    r'(\d{4}-\d{2}-\d{2})T',
                    day_tag.get_attribute('data-bem'))[0]
                if today <= datetime.datetime.strptime(date_of_week,
                                                       '%Y-%m-%d'):
                    dates_of_week.append(date_of_week)

            for day in dates_of_week[:7]:
                self.driver.get("%(channel_link)s?date=%(date)s" %
                                {'channel_link': link, 'date': day})
                time.sleep(1)
                program_tags = self.driver.find_elements_by_css_selector(
                    'div.b-tv-channel-schedule__items > '
                    'div.b-tv-channel-schedule__item > a')
                show_date = datetime.datetime.strptime(day, '%Y-%m-%d')
                tv_programs = []
                # Named ``program_tag`` so the ``channel`` object above is
                # not overwritten by the loop variable.
                for program_tag in program_tags:
                    program_name = program_tag.find_element_by_class_name(
                        'tv-event__title-inner').text
                    show_time = program_tag.find_element_by_class_name(
                        'tv-event__time-text').text + ':00'
                    genre = json.loads(program_tag.get_attribute(
                        'data-bem'))['tv-event']['genre']
                    tv_programs.append(TvProgram(name=program_name,
                                                 genre=genre,
                                                 show_date=show_date,
                                                 show_time=show_time))
                    count_programs += 1
                SaveRecordsToDb.save_programs(id_and_link['id'], tv_programs)

        # Whole seconds, matching what parse_url_channels reports.
        func_tm = int(time.time() - func_tm)
        text_for_log = (
            'Tv programs parsed successfully.'
            'Execution time: %s'
        ) % hours_minutes_seconds_from_seconds(func_tm)
        send_email(subject='Parser notification', text=text_for_log)
        write_to_log(text_for_log)
        SaveRecordsToDb.insert_log_info(parser_name='tv_programs',
                                        new_items=count_programs,
                                        execution_time=func_tm)
    finally:
        # Release the browser window whether or not parsing succeeded.
        self.driver.close()