def get_luxor_schedules():
    query = 'QueryCode=GetSessions'
    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')

    #create_dump_file('%s_schedules' % source.dump, settings.API_DUMP_PATH, data)
    '''
    xml = open('%s/dump_%s_schedules.xml' % (settings.API_DUMP_PATH, source.dump), 'r')  # temp
    data = xml.read()  # temp
    xml.close()  # temp
    '''

    films = get_source_data(source, 'film', 'dict')
    cinemas = get_source_data(source, 'cinema', 'dict')
    halls = get_source_data(source, 'hall', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    xml_data = BeautifulSoup(data, from_encoding="utf-8")

    for session in xml_data.findAll('session'):
        sch_id = session['id']
        if sch_id not in schedules:
            cinema_id = session.theatre['id'].encode('utf-8')
            hall_id = session.theatre.hall['id'].encode('utf-8')
            film_id = session.movie['id'].encode('utf-8')

            cinema_obj = cinemas.get(cinema_id)
            film_obj = films.get(film_id)
            hall_obj = halls.get(hall_id)

            if cinema_obj and film_obj and hall_obj:
                showdate = session.date.string.encode('utf-8')
                showtime = session.time.string.encode('utf-8')

                day, month, year = showdate.split('.')
                hours, minutes = showtime.split(':')
                dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))

                SourceSchedules.objects.create(
                    source_id=sch_id,
                    source_obj=source,
                    film=film_obj,
                    cinema=cinema_obj,
                    dtime=dtime,
                    hall=hall_obj.kid,
                )
    cron_success('xml', source.dump, 'schedules', 'Сеансы')
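
# A minimal, standalone sketch of the date/time parsing used above: the Luxor
# feed formats dates as 'dd.mm.yyyy' and times as 'HH:MM'. The helper name is
# ours (illustrative only, not part of the importer) and it assumes
# well-formed input and the module-level `import datetime`.
def _parse_show_dtime(showdate, showtime):
    """Combine 'dd.mm.yyyy' and 'HH:MM' strings into a datetime."""
    day, month, year = showdate.split('.')
    hours, minutes = showtime.split(':')
    return datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))

# Example: _parse_show_dtime('05.03.2014', '19:30')
#   -> datetime.datetime(2014, 3, 5, 19, 30)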
def get_kinoteatrua_releases():
    ''' Fetch Ukrainian release dates '''
    opener = give_me_cookie()
    source = ImportSources.objects.get(url='http://kino-teatr.ua/')

    films_dict = get_source_data(source, 'film', 'dict')

    releases = SourceReleases.objects.select_related('film').filter(source_obj=source)
    releases_dict = {}
    for i in releases:
        releases_dict[i.film.source_id] = i

    url = '%sfilms-near.phtml' % source.url
    req = opener.open(urllib2.Request(url))
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for ind, i in enumerate(data.findAll('a', {'class': 'searchItemLink'})):
            film_url = i.get('href')
            film_id = film_url.replace('http://kino-teatr.ua/film/', '').replace('.phtml', '').encode('utf-8')

            film_obj = films_dict.get(film_id)
            if film_obj:
                req2 = opener.open(urllib2.Request(film_url))
                if req2.getcode() == 200:
                    data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                    block = data2.find('div', id='filmInfo')
                    strong = block.find('strong', text=u"Премьера (в Украине): ")
                    day, month, year = strong.find_next_sibling("a").text.strip().split('.')
                    showdate = datetime.date(int(year), int(month), int(day))

                    release_obj = releases_dict.get(film_id)
                    if release_obj:
                        if release_obj.release != showdate:
                            release_obj.release = showdate
                            release_obj.save()
                    else:
                        release_obj = SourceReleases.objects.create(
                            source_obj=source,
                            film=film_obj,
                            release=showdate,
                        )
                        releases_dict[film_id] = release_obj

            if ind % 1 == 0:  # always true: throttle after every request
                time.sleep(random.uniform(1.0, 3.0))

    cron_success('html', source.dump, 'releases', 'Укр.релизы')
def get_okinoua_cities():
    """ Parse the list of Ukrainian cities """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # Get the already-known cities from the SourceCities table as a list
    cities_ids = get_source_data(source, 'city', 'list')

    data_nof_city = ''

    # Open the page that lists the cities
    url = '%skinoafisha-kiev/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        page = BeautifulSoup(req.read(), from_encoding="utf-8")
        # Find every city tag and read the city id and name from it
        for ul in page.findAll('ul', {'class': 'blist'}):
            for li in ul.findAll('li'):
                id = li.a.get('href').replace('/', '')
                name = li.a.string.encode('utf-8').strip()
                name_slug = low(del_separator(name))
                # Compare the scraped cities against our DB; if there is NO match yet,
                if id not in cities_ids:
                    # try to identify the new city
                    city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                    # if identified, record it in the SourceCities table
                    if city.count() == 1:
                        SourceCities.objects.create(
                            source_id=id,
                            source_obj=source,
                            city=city[0],
                            name=name,
                        )
                    # otherwise write the unmatched city to an xml file for later handling
                    else:
                        if 'slug="%s"' % name_slug not in data_nof_city:
                            data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)

    create_dump_file('okinoua_nof_city', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', 'okinoua', 'cities', 'Укр. города')
def get_rambler_cities():
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''

    '''
    # LOCALHOST
    f = open('%s/dump_rambler_city.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
    # --- end localhost
    '''

    # SERVER
    url = 'http://api.kassa.rambler.ru/v2/%s/xml/cities/' % RAMBLER_API_KEY  # dump_rambler_city.xml
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        xml = BeautifulSoup(req.read(), from_encoding="utf-8")
        # --- end server

        for i in xml.findAll('city'):
            id = i.cityid.string
            name = i.find('name').string.encode('utf-8')
            name_slug = low(del_separator(name))
            if id not in cities_ids:
                city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                    )
                else:
                    if 'slug="%s"' % name_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('xml', source.dump, 'cities', 'Города')
def get_premierzal_cities():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    cities = get_source_data(source, 'city', 'list')
    data_nof_city = ''

    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        block = data.find('div', {'class': 'drop'})
        for i in block.findAll('a'):
            city_name = i.text.encode('utf-8').strip()
            city_id = low(del_separator(city_name))
            if city_id.decode('utf-8') not in cities:
                city = City.objects.filter(name__name=city_id, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_id)
                cities.append(city_id.decode('utf-8'))

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', source.dump, 'cities', 'Города')
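
# The importers in this module key cities and cinemas on a normalized slug
# built with the project helpers low() and del_separator(). A minimal sketch
# of the same idea (an assumption about what those helpers do, not their
# actual code): lowercase the name and drop whitespace and punctuation so
# spelling variants collapse to one key. Expects unicode input.
import re

def _make_slug(name):
    """u'Нью-Йорк' and u'нью йорк' normalize to the same key."""
    return re.sub(r'[\s\-_.,"\']+', '', name.lower())

# Example: _make_slug(u'Санкт-Петербург') == _make_slug(u'санкт петербург')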
def get_ivi_file():
    ''' Fetch the txt data file '''
    source = ImportSources.objects.get(url='http://antipiracy.ivi.ru/')
    films = get_source_data(source, 'film', 'list')

    url = '%s-/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        links = BeautifulSoup(req.read(), from_encoding="windows-1251")
        for i in links.findAll('a'):
            link = i.string.encode('utf-8')
            if 'in one file.txt' in link:
                req2 = urllib.urlopen('%s%s' % (url, i.get('href')))
                data = BeautifulSoup(req2.read(), from_encoding="windows-1251")
                file = str(data).replace('<html><head></head><body>', '').replace('</body></html>', '')
                create_dump_file(source.dump, settings.API_DUMP_PATH, file, 'txt')
    cron_success('html', source.dump, 'file', 'txt файл с данными')
def get_vkinocomua_cities_and_cinemas():
    nofcities = []
    nofcinemas = []
    data_nof_cinema = ''
    data_nof_city = ''

    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    cities_dict = get_source_data(source, 'city', 'dict')

    req = urllib.urlopen('%safisha/kiev' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        cities_tag = data.find('select', id='city-selector')
        for ind, i in enumerate(cities_tag.findAll('option')):
            if i['value']:
                city_name = i.string.encode('utf-8')
                city_slug = low(del_separator(city_name))
                city_id = i['value'].encode('utf-8')

                city_obj = cities_dict.get(city_id)
                if not city_obj and city_id not in nofcities:
                    city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                    if city.count() == 1:
                        city_obj = SourceCities.objects.create(
                            source_id=city_id,
                            source_obj=source,
                            city=city[0],
                            name=city_name,
                        )
                        cities_dict[city_id] = city_obj
                    else:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)
                        nofcities.append(city_id)

                if city_obj:
                    url = '%scinema/%s' % (source.url, city_id)
                    req_cinema = urllib.urlopen(url)
                    if req_cinema.getcode() == 200:
                        data_cinema = BeautifulSoup(req_cinema.read(), from_encoding="utf-8")
                        for tag in data_cinema.findAll('a', {'class': 'cinema'}):
                            cinema_name = tag.string.encode('utf-8')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_id = tag.get('href').replace('/cinema/%s/' % city_id, '').encode('utf-8')

                            cinema_obj = cinemas_dict.get(cinema_id)
                            if not cinema_obj and cinema_id not in nofcinemas:
                                filter = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                                cinema_kid = cinema_identification(cinema_slug, filter)
                                if cinema_kid:
                                    try:
                                        cin_obj = Cinema.objects.get(code=cinema_kid)
                                        cinema_obj = SourceCinemas.objects.create(
                                            source_id=cinema_id,
                                            source_obj=source,
                                            city=city_obj,
                                            cinema=cin_obj,
                                            name=cinema_name,
                                        )
                                        cinemas_dict[cinema_id] = cinema_obj
                                    except Cinema.DoesNotExist:
                                        pass
                                else:
                                    nofcinemas.append(cinema_id)
                                    data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                        cinema_name, cinema_slug, city_name, city_obj.city.kid)

            if ind % 4 == 0:
                time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cities_and_cinemas', 'Города и кинотеатры')
def get_megamag():
    ''' Fetch film urls '''
    import cookielib

    def give_me_cookie():
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie), urllib2.HTTPHandler())
        return opener

    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://megamag.by/')
    sfilm_clean(source)

    megamag_cities_dict = get_source_data(source, 'city', 'dict')
    megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    cities_data = {}
    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []
    schedules_data = []

    opener = give_me_cookie()
    req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php'))

    event_dict = {}

    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        cities = data.find('div', id="box-region")
        for i in cities.findAll('a'):
            city_name = i.text.encode('utf-8')
            city_slug = low(del_separator(city_name))
            city_id = i.get('href').replace('http://kinoteatr.megamag.by/index.php?region_id=', '')

            mcity = megamag_cities_dict.get(city_id)
            if not mcity:
                city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    mcity = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)
            if mcity:
                cities_data[city_name] = mcity

        try:
            cinemas_tag = data.findAll('td', {'class': 'Cinema_new_box_1_BoxText'}, limit=1)[0]
        except IndexError:
            cinemas_tag = None

        if cinemas_tag:
            for i in cinemas_tag.findAll('a'):
                cinema_url = i.get('href')
                cinema_id = cinema_url.replace('http://kinoteatr.megamag.by/index.php?cPath=', '')
                cinema_obj = megamag_cinemas_dict.get(cinema_id)

                opener = give_me_cookie()
                try:
                    req2 = opener.open(urllib2.Request(cinema_url))
                    if req2.getcode() == 200:
                        schedules_page = BeautifulSoup(req2.read(), from_encoding="utf-8")

                        city_name = schedules_page.findAll('div', {'class': 'object_param_value'}, limit=1)[0].text.encode('utf-8')
                        city_obj = cities_data.get(city_name)
                        if city_obj:
                            cinema_name = schedules_page.find('div', {'class': 'object_title'}).text.encode('utf-8')
                            cinema_name = cinema_name.replace('"', '').replace('Кинотеатр', '')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid)

                            if cinema_ig_id not in ignored_cinemas:
                                if not cinema_obj:
                                    filter1 = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                                    cinema_kid = cinema_identification(cinema_slug, filter1)
                                    if cinema_kid:
                                        try:
                                            cinema = Cinema.objects.get(code=cinema_kid)
                                            cinema_obj = SourceCinemas.objects.create(
                                                source_id=cinema_id,
                                                source_obj=source,
                                                city=city_obj,
                                                cinema=cinema,
                                                name=cinema_name,
                                            )
                                        except Cinema.DoesNotExist:
                                            pass
                                else:
                                    cinema_kid = cinema_obj.cinema.code

                                if cinema_kid:
                                    for event in schedules_page.findAll('td', {'class': 'eventsHeading'}):
                                        if event.a.get('name'):
                                            ev = event.a['name'].split('_')[1]
                                            fname = event.a.text.encode('utf-8')
                                            fid = event.a.get('href').replace('http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=', '')
                                            event_dict[int(ev)] = {'name': fname, 'id': int(fid)}

                                    links = []
                                    for td in schedules_page.findAll('td', {'class': 'main'}):
                                        for link in td.findAll('a'):
                                            l = link.get('href')
                                            if l and 'cPath' in l:
                                                links.append(l)

                                    schedules_data.append({
                                        'mcity': city_obj,
                                        'city': city_obj.city,
                                        'mcinema': cinema_obj,
                                        'cinema': cinema_kid,
                                        'schedules': set(links),
                                    })
                                else:
                                    if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                            cinema_name, cinema_slug, city_name, city_obj.city.kid)
                except httplib.HTTPException:
                    pass

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)

    megamag = get_source_data(source, 'schedule', 'list')

    for obj in schedules_data:
        cinema_object = obj['mcinema']
        for index, i in enumerate(obj['schedules']):
            opener = give_me_cookie()
            try:
                req3 = opener.open(urllib2.Request(i))
                if req3.getcode() == 200:
                    id_schedule = i.replace('http://kinoteatr.megamag.by/index.php?cPath=', '').encode('utf-8')
                    if id_schedule not in megamag:
                        sch_page = BeautifulSoup(req3.read(), from_encoding="utf-8")
                        tables = sch_page.findAll('table', {'class': 'Cinema_new_box_2_TemplateCenterPart'}, limit=1)[0]
                        main_table = tables.findAll('table', cellpadding='4', limit=1)[0]
                        tr = main_table.findAll('tr')[1]
                        td = tr.findAll('strong')

                        event_id = id_schedule.split('_')[2]
                        film_data = event_dict.get(int(event_id))
                        if film_data:
                            film_name = film_data['name']
                            film_name_slug = low(del_separator(del_screen_type(film_name)))
                            film_id = film_data['id']

                            if film_id not in noffilms and film_name_slug.decode('utf-8') not in ignored:
                                obj = films.get(str(film_id).decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_name_slug, None, {}, {}, source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[str(film_id).decode('utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(film_name, film_name_slug, None, None,
                                                                      film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        dtime_info = td[1].text.encode('utf-8').split()
                                        year_info = datetime.datetime.now().year
                                        day_info = int(dtime_info[0])
                                        month_low = low(dtime_info[1].replace(',', ''))
                                        month_info = int(get_month(month_low))
                                        time_info = dtime_info[-1].replace('(', '').replace(')', '').split(':')
                                        dtime = datetime.datetime(year_info, month_info, day_info,
                                                                  int(time_info[0]), int(time_info[1]), 0)

                                        SourceSchedules.objects.create(
                                            source_id=id_schedule,
                                            source_obj=source,
                                            cinema=cinema_object,
                                            film=objt,
                                            dtime=dtime,
                                        )
            except httplib.HTTPException:
                open('%s/httplib_errors.txt' % settings.API_DUMP_PATH, 'a').write('%s\n' % i)

            # pause for 2 seconds after every 60 requests to the source
            if (index + 1) % 60 == 0:
                time.sleep(2.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_vkinocomua_films_and_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []

    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')
    cinemas_data = SourceCinemas.objects.select_related('city').filter(source_obj=source)

    cinemas = {}
    for ind, i in enumerate(cinemas_data):
        url = '%scinema/%s/%s/showtimes' % (source.url, i.city.source_id, i.source_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='cinema-showtimes')
            if main:
                for content in main.findAll('div', {'class': 'content'}):
                    film_tag = content.find('a', {'class': 'navi'})
                    film_name = film_tag.string.encode('utf-8').strip()
                    film_slug = low(del_separator(film_name))
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\/\d+\/', full_url)
                    if film_id:
                        film_id = film_id[0].replace('/', '').encode('utf-8')
                    else:
                        film_id = film_slug
                    full_url = '%s%s' % (source.url, full_url.lstrip('/'))

                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id.decode('utf-8'))
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)

                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id,
                                                             info, url.encode('utf-8'), source.id)
                                noffilms.append(film_id)

                            if objt:
                                for div in content.findAll('div', {'class': 'date'}):
                                    year, month, day = div['data-date'].split('-')
                                    show = div.find_next_sibling("ul")
                                    for li in show.findAll('li'):
                                        if li.a:
                                            extra = li.a.get('href')
                                            hours, minutes = li.a.text.strip().split(':')
                                        else:
                                            extra = None
                                            hours, minutes = li.text.strip().split(':')
                                        # sale = True if extra else False
                                        dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))
                                        sch_id = u'%s%s%s%s' % (dtime, i.source_id, i.city_id, film_id.decode('utf-8'))
                                        sch_id = sch_id.replace(' ', '')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=i,
                                                dtime=dtime,
                                                extra=extra,
                                            )
                                            schedules.append(sch_id)
        if ind % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
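
# Several importers above and below have no stable per-session id in the
# source markup, so they synthesize one by concatenating the datetime with
# cinema, city and film keys and stripping spaces. A compact sketch of that
# convention (the helper name is ours, for illustration only):
def _make_sch_id(dtime, cinema_key, city_key, film_key):
    sch_id = u'%s%s%s%s' % (dtime, cinema_key, city_key, film_key)
    return sch_id.replace(' ', '')

# _make_sch_id(datetime.datetime(2014, 3, 5, 19, 30), u'pk-kiev', u'kiev', u'12345')
#   -> u'2014-03-0519:30:00pk-kievkiev12345'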
def get_cinemaarthall_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []

    city_name = 'Норильск'
    cinema_name = 'Синема-АРТ-Холл'

    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://cinemaarthall.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    dates = []
    url = '%spage/kino/films/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        show_days = data.find('div', id='datachek')
        for a in show_days.findAll('a'):
            day = a.get('href').replace('/page/kino/films/&date=', '')
            dates.append(day)

    for d in dates:
        url = '%spage/kino/films/&date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            for div in data.findAll('div', {'class': 'media-block'}):
                film_name = div.find('h3')
                if film_name:
                    film_name = film_name.string.encode('utf-8')
                    film_id = div.findAll('a', limit=1)[0].get('href').replace('/', '').encode('utf-8')
                    film_slug = del_screen_type(low(del_separator(film_name)))
                    full_url = '%spage/kino/films/%s' % (source.url, film_id)

                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)

                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id,
                                                             info, full_url.encode('utf-8'), source.id)
                                noffilms.append(film_id)

                            if objt:
                                div_sess = div.find('div', {'class': 'filmr'})
                                for t in div_sess.findAll('span'):
                                    if t.string:
                                        t = t.string.split(',')[0]
                                        hours, minutes = t.split(':')
                                        day, month, year = d.split('.')
                                        dtime = datetime.datetime(int(year), int(month), int(day),
                                                                  int(hours), int(minutes))

                                        sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                        sch_id = sch_id.replace(' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def page_parser(city_name, cinema_name, source):
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []

    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url=source)
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    main_url = '%ssessions/' % source.url

    today = datetime.date.today()
    next_week = today + datetime.timedelta(days=6)
    delta = next_week - today

    for day in range(delta.days + 1):
        date_obj = today + datetime.timedelta(days=day)
        url = '%s%s' % (main_url, date_obj)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id="section-session")
            if main:
                main = main.find('table')
                for tr in main.findAll('tr'):
                    showtime, film = tr.findAll('td', limit=2)
                    hours, minutes = showtime.string.split(':')
                    film_a = film.findAll('a')
                    if film_a:
                        film_a = film_a[1] if len(film_a) > 1 else film_a[0]
                        full_url = film_a.get('href')
                        # strip the film-page prefix to get the source film id
                        # (the original swapped the replace() arguments)
                        film_id = full_url.replace('%sfilms/' % source.url, '').replace('/', '').encode('utf-8')
                        film_name = del_screen_type(film_a.get('title').encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))

                        if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {}, source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id,
                                                                 info, full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)

                                if objt:
                                    dtime = datetime.datetime(date_obj.year, date_obj.month, date_obj.day,
                                                              int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
    return data_nof_film
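
# page_parser() is a shared single-cinema importer: it resolves the city and
# cinema once, then walks a week of /sessions/<date> pages and returns the
# not-found-films xml fragment. A hypothetical wiring example; the city,
# cinema and source URL below are illustrative placeholders that must already
# exist in City/Cinema/ImportSources, not a real importer in this module:
def get_example_single_cinema_schedules():
    nof_xml = page_parser('Ялта', 'Сатурн', 'http://example-cinema.ru/')
    create_dump_file('example_nof_film', settings.NOF_DUMP_PATH, '<data>%s</data>' % nof_xml)
    cron_success('html', 'example', 'schedules', 'Сеансы')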
def get_kinohod_schedules():
    # print "BEGIN get_kinohod_schedules()"

    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    opener = urllib2.build_opener()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',
    }
    opener.addheaders = headers.items()

    cron_data_new = 0
    cron_data_new_sale = 0
    cron_data_nof = ''
    cron_count = 0
    cron_count_sale = 0
    film_list = []
    film_nof_list = []
    cinemas_count = 0

    source = ImportSources.objects.get(url='http://kinohod.ru/')

    kinohod_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    kinohod_films_dict = get_source_data(source, 'film', 'dict')

    today = datetime.datetime.now().date()
    today_add_seven_days = today + datetime.timedelta(days=6)

    kinohod_schedules = list(
        SourceSchedules.objects.filter(dtime__gte=today, source_obj=source).values_list('source_id', flat=True))

    for cinema_id, cinema_obj in kinohod_cinemas_dict.iteritems():
        today_temp = today
        while today_temp <= today_add_seven_days:
            today_str = today_temp.strftime("%d%m%Y")
            today_temp += datetime.timedelta(days=1)

            url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas/%s/schedules?apikey=%s&date=%s' % (
                cinema_id, SERVER_API_KEY, today_str)
            #url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas/%s/schedules?apikey=%s' % (cinema_id, SERVER_API_KEY)
            #url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas/300/schedules?apikey=%s' % SERVER_API_KEY  # ------ !!!!

            try:
                try:
                    req = opener.open(url)
                    if req.getcode() == 200:
                        cinemas_count += 1
                        json_data = req.read()
                        data = json.loads(json_data)

                        for i in data:
                            film_id = str(i['movie']['id'])
                            film_obj = kinohod_films_dict.get(film_id)
                            if film_obj:
                                film_list.append(film_id)
                            else:
                                film_nof_list.append(film_id)

                            for s in i['schedules']:
                                sale = s['isSaleAllowed']
                                cron_count += 1
                                if sale:
                                    cron_count_sale += 1
                                if film_obj:
                                    id = str(s['id'])
                                    if id not in kinohod_schedules:
                                        show_d = s['startTime'].split('T')[0].split('-')
                                        dtime = datetime.date(int(show_d[0]), int(show_d[1]), int(show_d[2]))

                                        # sessions starting between 00:00 and 05:59
                                        # are filed under the previous show day
                                        hour = int(s['time'].split(':')[0])
                                        if hour >= 0 and hour <= 5:
                                            dtime = dtime - datetime.timedelta(days=1)

                                        show_t = '%s:00' % s['time']
                                        show_t = show_t.split(':')
                                        dtime = datetime.datetime(dtime.year, dtime.month, dtime.day,
                                                                  int(show_t[0]), int(show_t[1]), 0)

                                        SourceSchedules.objects.create(
                                            source_id=id,
                                            source_obj=source,
                                            cinema=cinema_obj,
                                            film=film_obj,
                                            dtime=dtime,
                                            sale=sale,
                                        )
                                        cron_data_new += 1
                                        if sale:
                                            cron_data_new_sale += 1
                except httplib.HTTPException:
                    open('%s/httplib_errors.txt' % settings.API_DUMP_PATH, 'a').write('%s\n' % url)
            except (urllib2.HTTPError, urllib2.URLError):
                open('%s/httplib_errors.txt' % settings.API_DUMP_PATH, 'a').write('urllib2***\t%s\n' % url)

    # cron log
    film_sum = len(set(film_list + film_nof_list))
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = u'%s | %s - %s %s<br />' % (datetime.datetime.now().date(), start_time, end_time,
                                            u'Импорт сеансов киноход')
    cron_data += u'<br /><b>Получено</b>: %s (с продажей: %s)' % (cron_count, cron_count_sale)
    cron_data += u'<br /><b>Новых</b>: %s (с продажей: %s)' % (cron_data_new, cron_data_new_sale)
    cron_data += u'<br /><b>Кинотеатров</b>: %s' % cinemas_count
    cron_data += u'<br /><b>Фильмов</b>: %s (не идент: %s)<br />' % (film_sum, len(set(film_nof_list)))
    for i in range(50):
        cron_data += u'- '
    process_time = time.time() - t1
    cron_data = u'<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_schedules.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data.encode('utf-8'))

    cron_success('json', source.dump, 'schedules', 'Сеансы')
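
# A standalone sketch of the show-day rollback used in get_kinohod_schedules():
# the feed stamps a 00:30 screening with the next calendar date, but the
# importer files it under the previous day's programme. The helper name is
# ours; it mirrors the logic above and assumes 'HH:MM' input.
def _rollback_early_morning(show_date, time_str):
    """show_date: datetime.date, time_str: 'HH:MM' -> datetime.datetime."""
    hours, minutes = [int(x) for x in time_str.split(':')]
    if 0 <= hours <= 5:
        show_date = show_date - datetime.timedelta(days=1)
    return datetime.datetime(show_date.year, show_date.month, show_date.day, hours, minutes)

# _rollback_early_morning(datetime.date(2014, 3, 6), '00:30')
#   -> datetime.datetime(2014, 3, 5, 0, 30)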
def get_premierzal_schedules():
    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    cities_cinemas = {}
    for i in SourceCinemas.objects.select_related('city').filter(source_obj=source):
        if not cities_cinemas.get(i.city.source_id):
            cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []}
        cities_cinemas[i.city.source_id]['cinemas'].append(i)

    for k, v in cities_cinemas.iteritems():
        city_url_encode = urllib.quote(v['city'].name.encode('utf-8'))
        for i in v['cinemas']:
            main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id, city_url_encode)
            main_req = urllib.urlopen(main_url)
            if main_req.getcode() == 200:
                data = BeautifulSoup(main_req.read())
                data = data.find('div', id="films-list")
                if data:
                    dates = []
                    for calendar in data.findAll('table', {'class': 'calendar'}):
                        for a in calendar.findAll('a'):
                            href = a.get('href', '')
                            href_dict = dict(cgi.parse_qsl(href))
                            calendar_date = href_dict.get(u'?date', href_dict.get(u'date'))
                            if calendar_date:
                                dates.append({'date': calendar_date, 'href': href})

                    for ind, d in enumerate(dates):
                        films_blocks = []
                        if ind == 0:
                            films_blocks = data.findAll('div', {'class': 'film-item-wrapper'})
                        else:
                            url = '%s?date=%s&city=%s&theatre=%s' % (source.url, d['date'],
                                                                     city_url_encode, i.source_id)
                            req = urllib.urlopen(url)
                            if req.getcode() == 200:
                                data = BeautifulSoup(req.read())
                                data = data.find('div', id="films-list")
                                films_blocks = data.findAll('div', {'class': 'film-item-wrapper'})
                            time.sleep(random.uniform(0.8, 2.2))

                        for block in films_blocks:
                            title = block.find('div', {'class': 'title'}).find('a')
                            film_name = title.text.encode('utf-8').strip()
                            film_slug = low(del_separator(del_screen_type(film_name)))
                            film_id = film_slug

                            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_slug, None, {}, {}, source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id.decode('utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                                     film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        year, month, day = d['date'].split(u'-')
                                        for tm in block.findAll('div', {'class': 'seanse-item'}):
                                            for t in tm.text.encode('utf-8').split('|'):
                                                t = re.findall(r'\d{2}\:\d{2}', t)
                                                if t:
                                                    hours, minutes = t[0].strip().split(':')
                                                    dtime = datetime.datetime(int(year), int(month), int(day),
                                                                              int(hours), int(minutes))
                                                    sch_id = '%s%s%s' % (dtime, i.source_id.encode('utf-8'), film_id)
                                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                    if sch_id not in schedules:
                                                        SourceSchedules.objects.create(
                                                            source_id=sch_id,
                                                            source_obj=source,
                                                            film=objt,
                                                            cinema=i,
                                                            dtime=dtime,
                                                        )
                                                        schedules.append(sch_id)
            time.sleep(random.uniform(1.1, 1.8))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
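
# The calendar links above carry the show date as a query parameter, and
# cgi.parse_qsl does not strip a leading '?', so the first pair can come back
# keyed as '?date' rather than 'date'; that is why the code probes both keys.
# A minimal demonstration of that parsing:
import cgi

def _date_from_href(href):
    params = dict(cgi.parse_qsl(href))
    return params.get('?date', params.get('date'))

# _date_from_href('?date=2014-03-05&city=x') -> '2014-03-05'
# _date_from_href('date=2014-03-05&city=x')  -> '2014-03-05'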
def get_kinobklass_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []

    source = ImportSources.objects.get(url='http://kino-bklass.ru/')
    sfilm_clean(source)

    city_name = 'Серпухов'
    cinema_name = 'Кинотеатр в ТРК "Б-Класс"'

    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    today = datetime.datetime.now().date()
    to = today + datetime.timedelta(days=6)
    delta = to - today

    for day in range(delta.days + 1):
        d = today + datetime.timedelta(days=day)
        url = '%s?date=%s' % (source.url, d.strftime("%Y%m%d"))
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            table = data.findAll('table', id='rasp', limit=1)[0]
            for td in table.findAll('td', colspan='10'):
                full_url = td.a.get('href')
                film_id = full_url.replace('http://kino-bklass.ru/films/', '').replace('/', '').encode('utf-8')
                film_name = td.a.h3.string.strip().split(' ')[0].encode('utf-8')
                film_slug = del_screen_type(low(del_separator(film_name)))

                if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                    obj = films.get(film_id.decode('utf-8'))
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None, {}, {}, source=source)

                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source, film_name)
                                films[film_id.decode('utf-8')] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id,
                                                         info, full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id)

                        if objt:
                            tr = td.find_next('tr')
                            times = []
                            for time_tag in tr.findAll('td'):
                                t = None
                                if time_tag.string:
                                    t = time_tag.string.strip().encode('utf-8')
                                if time_tag.b:
                                    t = time_tag.b.string.strip().encode('utf-8')
                                if t:
                                    # the site mixes ':', '-' and '^' as hour/minute separators
                                    try:
                                        hours, minutes = t.split(':')
                                    except ValueError:
                                        try:
                                            hours, minutes = t.split('-')
                                        except ValueError:
                                            hours, minutes = t.split('^')
                                    # '24:XX' is not a valid datetime hour; clamp to 23:59
                                    if hours == '24':
                                        hours, minutes = (23, 59)
                                    year, month, day = str(d).split('-')
                                    dtime = datetime.datetime(int(year), int(month), int(day),
                                                              int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        time.sleep(3.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
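
# A standalone sketch of the tolerant time parsing above: accept 'HH:MM',
# 'HH-MM' or 'HH^MM' and clamp the nonstandard '24:XX' to the last valid
# minute of the day. The helper name is ours, not part of the importer.
def _parse_messy_time(t):
    for sep in (':', '-', '^'):
        if sep in t:
            hours, minutes = t.split(sep)
            break
    else:
        raise ValueError('unrecognized time format: %r' % t)
    if hours == '24':
        hours, minutes = '23', '59'
    return int(hours), int(minutes)

# _parse_messy_time('24-00') -> (23, 59)
# _parse_messy_time('19^30') -> (19, 30)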
def get_premierzal_cinemas():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    cinemas = get_source_data(source, 'cinema', 'list')
    cities_dict = get_source_data(source, 'city', 'dict')

    cinemas_dict = {}
    for i in Cinema.objects.all():
        cinemas_dict[i.code] = i

    ignored_cinemas = get_ignored_cinemas()

    data_nof_cinema = ''

    city = cities_dict.values()[0]
    body = urllib.urlencode({
        'city': city.name.encode('utf-8'),
    })
    url = '%stheatres?%s' % (source.url, body)
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        blocks = []
        block1 = data.find('div', {'class': 'this_city_theatres'})
        block2 = data.find('div', {'class': 'other_city_theatres'})
        if block1:
            blocks.append(block1)
        if block2:
            blocks.append(block2)

        for ind, block in enumerate(blocks):
            for a in block.findAll('a'):
                cinema_name = a.text.encode('utf-8').strip().replace('"', '')
                cinema_id = a.get('href').replace('/theatres/', '').replace('/', '')

                if ind == 0:
                    city_obj = city
                else:
                    city_name, cinema_name = cinema_name.split(',')
                    cinema_name = cinema_name.strip()
                    city_slug = low(del_separator(city_name.strip()))
                    city_obj = cities_dict.get(city_slug.decode('utf-8'))

                cinema_slug = low(del_separator(cinema_name))

                if city_obj:
                    cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid)
                    if cinema_id.decode('utf-8') not in cinemas and cinema_ig_id not in ignored_cinemas:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id
                        }
                        cinema = cinema_identification(cinema_slug, filter1)
                        cin_obj = cinemas_dict.get(cinema)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cin_obj,
                                name=cinema_name,
                            )
                            cinemas.append(cinema_id.decode('utf-8'))
                        else:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug, city_obj.name.encode('utf-8'), city_obj.city.kid)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
def get_planeta_schedules():
    data_nof_hall = ''
    source = ImportSources.objects.get(url='http://planeta-kino.com.ua/')

    planeta_schedules = get_source_data(source, 'schedule', 'list')
    planeta_cities_dict = get_source_data(source, 'city', 'dict')
    planeta_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    planeta_films_dict = get_source_data(source, 'film', 'dict')

    nof_list = []

    for i in planeta_kino_urls:
        xml = open('%s/dump_planetakino_%s.xml' % (settings.API_DUMP_PATH, i['city']), 'r')
        xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
        xml.close()

        for day in xml_data.findAll('day'):
            release_date = day['date'].encode('utf-8')
            for show in day.findAll('show'):
                cinema_id = show['theatre-id'].encode('utf-8')
                city_id = cinema_id.split('-')[1].encode('utf-8')
                city = planeta_cities_dict.get(city_id)
                cinema = planeta_cinemas_dict.get(cinema_id)
                film_id = show['movie-id']
                film = planeta_films_dict.get(film_id)
                if city and cinema and film:
                    time_data = show['time'].encode('utf-8')
                    technology = show['technology'].encode('utf-8')
                    hall_id = show['hall-id'].encode('utf-8')

                    d = release_date.split('-')
                    t = time_data.split(':')
                    dtimedate = datetime.datetime(int(d[0]), int(d[1]), int(d[2]), int(t[0]), int(t[1]))

                    planeta_id = '%s%s%s%s%s' % (dtimedate, hall_id, cinema_id, city_id, film_id)
                    planeta_id = planeta_id.replace(' ', '')

                    id = '%s%s%s' % (hall_id, cinema_id, city_id)

                    if planeta_id not in planeta_schedules and id not in nof_list:
                        # identify the hall
                        hall_obj = Hall.objects.filter(name__name=hall_id, cinema=cinema.cinema).distinct('pk')
                        # if found, every object is identified: store the schedule
                        if hall_obj.count() == 1:
                            SourceSchedules.objects.get_or_create(
                                source_id=planeta_id,
                                source_obj=source,
                                defaults={
                                    'source_id': planeta_id,
                                    'source_obj': source,
                                    'film': film,
                                    'cinema': cinema,
                                    'hall': hall_obj[0].kid,
                                    'dtime': dtimedate,
                                })
                        # the hall was not found: remember it and, if this tag is
                        # not among the not-found ones yet, add it to the dump
                        else:
                            nof_list.append(id)
                            data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (
                                city.name.encode('utf-8'), city.city.kid, cinema.name.encode('utf-8'),
                                cinema.cinema.code, hall_id, hall_id, id)

    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_hall)
    cron_success('xml', source.dump, 'schedules', 'Сеансы')
def get_planeta_cities_cinemas():
    ''' Fetch the PlanetaKino schedules xml data '''
    source = ImportSources.objects.get(url='http://planeta-kino.com.ua/')

    planeta_cities_dict = get_source_data(source, 'city', 'dict')
    planeta_cinemas = get_source_data(source, 'cinema', 'list')

    data_nof_city = ''
    data_nof_cinema = ''

    for i in planeta_kino_urls:
        city_name = i['city_name']
        city_slug = low(del_separator(city_name))
        city_id = i['city']

        req = urllib.urlopen(i['url'])
        if req.getcode() == 200:
            f = open('%s/dump_planetakino_%s.xml' % (settings.API_DUMP_PATH, city_id), 'w')
            f.write(str(req.read()))
            f.close()

            city_obj = planeta_cities_dict.get(city_id)
            if not city_obj:
                city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)

            if city_obj:
                city_kid = city_obj.city.kid
                cinema_name = 'Планета Кино IMAX'
                cinema_slug = low(del_separator(cinema_name))
                cinema_id = 'imax-%s' % i['city'] if i['city'] == 'kiev' else 'pk-%s' % i['city']
                if cinema_id not in planeta_cinemas:
                    filter1 = {'name__name': cinema_slug, 'name__status': 2, 'city__kid': city_kid}
                    cinema_kid = cinema_identification(cinema_slug, filter1)
                    if cinema_kid:
                        cinema = Cinema.objects.get(code=cinema_kid)
                        SourceCinemas.objects.create(
                            source_id=cinema_id,
                            source_obj=source,
                            city=city_obj,
                            cinema=cinema,
                            name=cinema_name,
                        )
                    else:
                        tags = 'slug="%s" city_kid="%s"' % (cinema_slug, city_kid)
                        if tags not in data_nof_cinema:
                            data_nof_cinema += '<cinema name="%s" city="%s" slug="%s" city_kid="%s"></cinema>' % (
                                cinema_name, city_name, cinema_slug, city_kid)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('xml', source.dump, 'cities', 'Города')
    cron_success('xml', source.dump, 'cinemas', 'Кинотеатры')
def get_okinoua_schedules():
    """ Parse Ukrainian showtimes """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # Dict of identified OkinoUA films
    okinoua_films_dict = get_source_data(source, 'film', 'dict')
    # Dict of identified OkinoUA cities
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')
    # Dict of identified OkinoUA cinemas
    okinoua_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    # List of identified OkinoUA schedules
    okinoua_schedules = get_source_data(source, 'schedule', 'list')

    counter1 = 0
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter1 += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        dates = []
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            # the city has showtimes
            item = page.find('div', {'class': 'item0'})
            if item:
                # collect the dates a schedule exists for
                date_div = page.find('div', id='afisha-date')
                dates = [i.get('href').strip() for i in date_div.findAll('a')]

        counter = 0
        for date in dates:
            counter += 1
            url2 = '%s%s' % (url, date)
            req2 = urllib.urlopen(url2)
            if req2.getcode() == 200:
                page2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                for div in page2.findAll('div', {'class': 'item0'}):
                    cinema_tag = div.find('h3')
                    cinema_id = cinema_tag.a.get('href').replace('/', '').encode('utf-8')
                    cinema_obj = okinoua_cinemas_dict.get(cinema_id)
                    if cinema_obj:
                        for film in div.findAll('div', {'class': 'item2'}):
                            if film.div.div.a:
                                film_name = film.div.div.a.string.encode('utf-8')
                                film_id = film.div.div.a.get('href').replace('/film/', '').replace('/', '').encode('utf-8')
                            else:
                                film_name = film.div.div.string.strip().encode('utf-8')
                                film_id = None

                            film_name_slug = low(del_separator(del_screen_type(film_name)))
                            if not film_id:
                                film_id = film_name_slug

                            film_obj = okinoua_films_dict.get(film_id)
                            if film_obj:
                                showtime = film.find('div', {'class': 'showtime'})
                                for time_tag in showtime.findAll('span'):
                                    hours, minutes = time_tag.string.encode('utf-8').split(':')
                                    year, month, day = date.replace('?date=', '').split('-')
                                    dtime = datetime.datetime(int(year), int(month), int(day),
                                                              int(hours), int(minutes), 0)

                                    id = '%s%s%s%s' % (dtime, cinema_id, city_id.encode('utf-8'), film_id)
                                    id = id.replace(' ', '')
                                    if id.decode('utf-8') not in okinoua_schedules:
                                        SourceSchedules.objects.create(
                                            source_id=id,
                                            source_obj=source,
                                            cinema=cinema_obj,
                                            film=film_obj,
                                            dtime=dtime,
                                        )
                                        okinoua_schedules.append(id)
            if counter % 4 == 0:
                time.sleep(random.uniform(1.0, 3.0))
        if counter1 % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    cron_success('html', 'okinoua', 'schedules', 'Сеансы')
def get_okinoua_films():
    """ Parse Ukrainian films """
    xml = open('%s/dump_okinoua_nof_film.xml' % settings.NOF_DUMP_PATH, 'r')
    xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
    xml.close()

    films_slugs = []
    for i in xml_data.findAll('film'):
        slug = i.get('slug_ru')
        films_slugs.append(slug)

    source = ImportSources.objects.get(url='http://www.okino.ua/')
    data_nof_films = ''
    not_founded_films = []

    # List of identified OkinoUA films
    okinoua_films = get_source_data(source, 'film', 'list')
    # Dict of identified OkinoUA cities
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')
    # Dict of identified OkinoUA cinemas
    okinoua_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    counter = 0
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        dates = []
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            for div in page.findAll('div', {'class': 'item0'}):
                for film in div.findAll('div', {'class': 'item2'}):
                    alt_name = None
                    if film.div.div.a:
                        film_name = film.div.div.a.string.encode('utf-8')
                        film_a = film.div.div.a.get('href')
                        film_id = film_a.replace('/film/', '').replace('/', '').encode('utf-8')
                        full_url = '%sfilm/%s' % (source.url, film_id)
                        req_name = urllib.urlopen(full_url)
                        if req_name.getcode() == 200:
                            filmpage = BeautifulSoup(req_name.read(), from_encoding="utf-8")
                            title = filmpage.find('div', {'class': 'item'})
                            if title.h4:
                                # the h4 holds the alternative title; drop any parenthesized part
                                alt_name = title.h4.text.encode('utf-8')
                                alt_name = re.sub(r'\(.*?\)', '', alt_name).strip()
                    else:
                        film_name = film.div.div.string.strip().encode('utf-8')
                        film_id = None

                    film_name_slug = low(del_separator(del_screen_type(film_name)))
                    if not film_id:
                        film_id = film_name_slug.decode('utf-8')

                    if film_id not in okinoua_films:
                        kid, info = film_identification(film_name_slug, None, {}, {}, source=source)
                        if kid:
                            film_obj, created = SourceFilms.objects.get_or_create(
                                source_id=film_id,
                                source_obj=source,
                                defaults={
                                    'source_id': film_id,
                                    'source_obj': source,
                                    'name': film_name,
                                    'kid': kid,
                                    'name_alter': alt_name,
                                })
                        else:
                            slug_tag = 'slug_ru="%s"' % film_name_slug
                            if slug_tag not in data_nof_films and film_name_slug.decode('utf-8') not in films_slugs:
                                data_nof_films += xml_noffilm(film_name, film_name_slug, None, None,
                                                              film_id.encode('utf-8'), info,
                                                              full_url.encode('utf-8'), source.id)
                        okinoua_films.append(film_id)
        if counter % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)
    create_dump_file('okinoua_nof_film', settings.NOF_DUMP_PATH, xml_data)
    cron_success('html', 'okinoua', 'films', 'Фильмы')
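
# A small illustration of the alt-title cleanup above: okino.ua renders the
# alternative title with a parenthesized year or qualifier, and re.sub with a
# non-greedy group removes every such "(...)" run. Helper name is ours.
import re

def _strip_parens(title):
    return re.sub(r'\(.*?\)', '', title).strip()

# _strip_parens('Interstellar (2014) (IMAX)') -> 'Interstellar'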
def get_okinoua_cinemas():
    """ Parse Ukrainian cinemas """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # List of identified OkinoUA cinemas
    cinemas_ids = get_source_data(source, 'cinema', 'list')

    data_nof_cinema = ''

    # Dict of identified OkinoUA cities
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')

    cinemas = Cinema.objects.all()
    cinemas_dict = {}
    for i in cinemas:
        cinemas_dict[i.code] = i

    counter = 0
    # Open each city page, if reachable, and parse it with BeautifulSoup
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            # Find every cinema tag and read the cinema id and name from it
            for div in page.findAll('div', {'class': 'item0'}):
                cinema_tag = div.find('h3')
                cinema_id = cinema_tag.a.get('href').replace('/', '')
                cinema_name = cinema_tag.a.string.encode('utf-8')
                cinema_slug = low(del_separator(cinema_name))
                if cinema_id not in cinemas_ids:
                    filter = {
                        'name__name': cinema_slug,
                        'name__status': 2,
                        'city__id': city_obj.city_id
                    }
                    cinema_kid = cinema_identification(cinema_slug, filter)
                    if cinema_kid:
                        try:
                            cinema = Cinema.objects.get(code=cinema_kid)
                            cinema_obj = SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cinema,
                                name=cinema_name,
                            )
                        except Cinema.DoesNotExist:
                            pass
                    else:
                        if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug, city_obj.name.encode('utf-8'), city_obj.city.kid)
        if counter % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('okinoua_nof_cinema', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('html', 'okinoua', 'cinemas', 'Укр. кинотеатры')
def get_kinohod_cinemas():
    # print "BEGIN get_kinohod_cinemas()"

    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas?apikey=%s' % SERVER_API_KEY

    source = ImportSources.objects.get(url='http://kinohod.ru/')

    kinohod_cinemas = get_source_data(source, 'cinema', 'list')
    kinohod_cities_dict = get_source_data(source, 'city', 'dict')

    cinemass = Cinema.objects.all()
    cinemass_dict = {}
    for i in cinemass:
        cinemass_dict[i.code] = i

    count = 0
    data_nof_cinema = ''

    for cid, kinohod_city in kinohod_cities_dict.iteritems():
        try:
            url = '%s&city=%s' % (main_url, cid)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)

                for i in data:
                    cron_count += 1
                    id = str(i['id']).decode('utf-8')
                    if id not in kinohod_cinemas:
                        name = i['title']
                        name_slug = del_screen_type(name.encode('utf-8'))
                        name_slug = low(del_separator(name_slug))

                        short_name = i['shortTitle']
                        short_name_slug = del_screen_type(short_name.encode('utf-8'))
                        short_name_slug = low(del_separator(short_name_slug))

                        filter1 = {'name__name': name_slug, 'name__status': 2, 'city__id': kinohod_city.city_id}
                        filter2 = {'name__name': short_name_slug, 'name__status': 2, 'city__id': kinohod_city.city_id}

                        cinema_kid = cinema_identification(short_name_slug, filter1, filter2)
                        cin_obj = cinemass_dict.get(cinema_kid)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=id,
                                source_obj=source,
                                city=kinohod_city,
                                cinema=cin_obj,
                                name=name,
                                name_alter=short_name,
                            )
                            cron_data_new += '%s<br />' % short_name.encode('utf-8')
                        else:
                            count += 1
                            name_city = kinohod_city.name
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                short_name.encode('utf-8'), short_name_slug, name_city.encode('utf-8'),
                                kinohod_city.city.kid)
                            cron_data_nof += '%s<br />' % short_name.encode('utf-8')
                        kinohod_cinemas.append(id)
        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH, 'a').write(str(url) + '\n')

    data_nof_cinema += '<sum>%s</sum>' % count
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(), start_time, end_time,
                                       'Импорт кинотеатров киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cinemas.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data)

    cron_success('json', source.dump, 'cinemas', 'Кинотеатры')
def get_rottentomatoes_films(everyday=True):
    def get_critic(block):
        critic = block.findAll('div', id="scoreStats", limit=1)
        if critic:
            critic = critic[0].findAll('div')
            average = critic[0].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            reviews = critic[1].findAll('span', limit=2)[1].text.strip()
            fresh = critic[2].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            rotten = critic[3].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            return '%s;%s;%s;%s' % (average.replace('/10', ''), reviews, fresh, rotten)
        else:
            return 'N/A;0;0;0'
        '''
        critic = block.findAll('p', {'class': 'critic_stats'}, limit=1)[0]
        average, reviews = critic.findAll('span', limit=2)
        try:
            fresh, rotten = reviews.next_sibling.next_sibling.encode('utf-8').strip().split(' | ')
        except AttributeError:
            return 'N/A;0;0;0'
        fresh = fresh.replace('Fresh:', '').strip()
        rotten = rotten.replace('Rotten:', '').strip()
        average = average.string.encode('utf-8').split('/')[0]
        reviews = reviews.string.encode('utf-8')
        return '%s;%s;%s;%s' % (average, reviews, fresh, rotten)
        '''

    source = ImportSources.objects.get(url='http://www.rottentomatoes.com/')
    sfilm_clean(source)

    noffilms = []
    data_nof_film = ''

    filter = {'source_obj': source}
    if everyday:
        today = datetime.datetime.today().date()
        day7 = today + datetime.timedelta(days=7)
        today = today - datetime.timedelta(days=30)
        filter['text__gte'] = today
        filter['text__lt'] = day7

    exists = get_source_data(source, 'film', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(**filter)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    ignored = get_ignored_films()

    opener = urllib2.build_opener()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',
    }
    opener.addheaders = headers.items()

    updated = []
    for k, f in films.items():
        film_url = '%s%s' % (source.url, k)
        req = opener.open(film_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            extra = get_critic(data)
            f.extra = extra
            f.save()
            updated.append(k)
        time.sleep(1)

    u = 'http://www.rottentomatoes.com/api/private/v1.0/m/list/find?page=1&limit=50&type=opening&minTomato=0&maxTomato=100&minPopcorn=0&maxPopcorn=100&services=&genres=1%3B2%3B4%3B5%3B6%3B8%3B9%3B10%3B11%3B13%3B14%3B18&sortBy=popularity&certified=false'
    req = opener.open(u)
    if req.getcode() == 200:
        data = json.loads(req.read(), encoding="latin-1")
        for i in data['results']:
            title = i['title'].encode('utf-8')
            title_slug = low(del_separator(title))
            url = i['url'].lstrip('/')
            full_url = '%s%s' % (source.url, url)
            if url not in exists and url not in noffilms:
                if title_slug.decode('utf-8') not in ignored and url not in updated:
                    time.sleep(1)
                    req2 = opener.open(full_url)
                    if req2.getcode() == 200:
                        data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")

                        year_block = data2.find('h1', {'class': 'title hidden-xs'})
                        if not year_block:
                            year_block = data2.find('h1', id='movie-title')
                        year_tmp = year_block.find('span', {'class': 'h3 year'}).text.encode('utf-8')
                        year = int(year_tmp.replace('(', '').replace(')', ''))

                        release_date = data2.find('td', itemprop="datePublished")
                        if release_date:
                            release_date = release_date.get('content')

                        extra = get_critic(data2)

                        obj = films.get(url)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                                obj.extra = extra
                                obj.save()
                            else:
                                kid, info = film_identification(None, title_slug, {}, {}, year, source=source)

                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    new = create_sfilm(url, kid, source, title, txt=release_date, extra=extra)
                                    films[url] = new
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(new)
                            elif not obj:
                                data_nof_film += xml_noffilm(title, title_slug, None, None, url.encode('utf-8'),
                                                             info, full_url.encode('utf-8'), source.id)
                                noffilms.append(url)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы, рейтинг')
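
# get_critic() packs the critic stats into a single 'average;reviews;fresh;rotten'
# string stored on SourceFilms.extra. A sketch of reading that field back; the
# field layout is inferred from the code above ('N/A' means no score block),
# and the numeric conversions are our assumption about the stored values:
def _unpack_critic(extra):
    average, reviews, fresh, rotten = extra.split(';')
    return {
        'average': None if average == 'N/A' else float(average),
        'reviews': int(reviews),
        'fresh': int(fresh),
        'rotten': int(rotten),
    }

# _unpack_critic('7.1;180;150;30')
#   -> {'average': 7.1, 'reviews': 180, 'fresh': 150, 'rotten': 30}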
def get_kinohod_films():
    # print "BEGIN get_kinohod_films()"
    ignored = get_ignored_films()

    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0
    noffilms = []

    source = ImportSources.objects.get(url='http://kinohod.ru/')
    sfilm_clean(source)

    kinohod_cities = get_source_data(source, 'city', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    data_nof_films = ''

    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s' % SERVER_API_KEY

    for city_id in kinohod_cities:
        try:
            url = '%s&city=%s' % (main_url, city_id)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    film_id = str(i['id']).decode('utf-8')
                    year = int(i['productionYear']) if i['productionYear'] else None
                    name_ru = i['title'].encode('utf-8')
                    name_ru_slug = low(del_separator(del_screen_type(name_ru)))
                    full_url = '%smovie/%s/' % (source.url, film_id)
                    name_en = None
                    name_en_slug = None
                    if i['originalTitle']:
                        name_en = i['originalTitle'].encode('utf-8')
                        name_en_slug = low(del_separator(del_screen_type(name_en)))
                    if year and name_ru_slug.decode('utf-8') not in ignored and film_id not in noffilms:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            try:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, year=year, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid, source, name_ru, name_alt=name_en, year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(new)
                                        cron_data_new += '%s<br />' % name_ru
                                elif not obj:
                                    if not name_en:
                                        name_en = '*'
                                        name_en_slug = '*'
                                    data_nof_films += xml_noffilm(name_ru, name_ru_slug, name_en, name_en_slug, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                    cron_data_nof += '%s<br />' % name_ru
                            except db.backend.Database._mysql.OperationalError:
                                pass
        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH, 'a').write(str(url) + '\n')

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(), start_time, end_time, 'Импорт фильмов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_films.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data)

    cron_success('json', source.dump, 'films', 'Фильмы')

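# For reference: the movies feed above is fetched once per saved city id. A
# minimal standalone sketch of the same request/decode cycle, assuming the v1
# endpoint and SERVER_API_KEY used above (and the module-level urllib/json
# imports); fetch_kinohod_movies is a hypothetical helper, not part of the
# original import pipeline.
def fetch_kinohod_movies(api_key, city_id):
    """Return the decoded JSON movie list for one city, or None on HTTP error."""
    url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s&city=%s' % (api_key, city_id)
    req = urllib.urlopen(url)
    if req.getcode() != 200:
        return None
    return json.loads(req.read())
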
def get_oreanda_and_spartak():
    ignored = get_ignored_films()

    city_name = 'Ялта'
    city_slug = low(del_separator(city_name))

    xdata = (
        {'url': 'http://yaltakino.com/Oreanda/', 'eng': 'Oreanda', 'ru': 'Ореанда'},
        {'url': 'http://yaltakino.com/Spartak/', 'eng': 'Spartak', 'ru': 'Спартак'},
    )

    for data in xdata:
        data_nof_film = ''
        noffilms = []

        source = ImportSources.objects.get(url=data['url'])
        sfilm_clean(source)

        films = {}
        source_films = SourceFilms.objects.filter(source_obj=source)
        for i in source_films:
            films[i.source_id] = i
        fdict = get_all_source_films(source, source_films)

        schedules = get_source_data(source, 'schedule', 'list')

        city = City.objects.get(name__name=city_name, name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })

        cinema_name = data['ru']
        cinema_eng = data['eng']
        cinema_slug = low(del_separator(cinema_name))
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })

        main_url = '%sschedule/' % source.url
        req = urllib.urlopen(main_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            main = data.find('td', {'class': 'contentplaceholder'})
            for div in main.findAll('div', {'class': 'scheduleDayCaption'}):
                sess_date, sess_day = div.text.split(' / ')
                day, month = sess_date.split()
                month = get_month(month.encode('utf-8'))
                year = datetime.datetime.now().year
                table = div.find_next('table')
                for tr in table.findAll('tr'):
                    if tr.find('td', {'class': 'scheduleTime'}):
                        hour, minute = tr.find('td', {'class': 'scheduleTime'}).text.split(':')
                        film = tr.find('a', {'class': 'scheduleLink'})
                        film_url = film.get('href')
                        full_url = 'http://yaltakino.com%s' % film_url
                        film_id = film_url.replace('/%s/?filmid=' % cinema_eng, '')
                        film_name = del_screen_type(film.text.encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))
                        if film_id.encode('utf-8') not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id.encode('utf-8'))
                                if objt:
                                    dtime = datetime.datetime(year, int(month), int(day), int(hour), int(minute))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id.encode('utf-8'))
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)

        create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
        cron_success('html', source.dump, 'schedules', 'Сеансы')

def get_kinohod_cities():
    # print "BEGIN get_kinohod_cities()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    url = 'http://www.kinohod.ru/api/rest/partner/v1/cities?apikey=%s' % SERVER_API_KEY
    source = ImportSources.objects.get(url='http://kinohod.ru/')

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        kinohod_cities = get_source_data(source, 'city', 'list')
        data_nof_city = ''
        json_data = req.read()
        data = json.loads(json_data)
        for i in data:
            cron_count += 1
            id = str(i['id']).decode('utf-8')
            if id not in kinohod_cities:
                alias = i['alias']
                name = i['name'].encode('utf-8')
                name_slug = del_screen_type(low(del_separator(name)))
                city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                        name_alter=alias,
                    )
                    cron_data_new += '%s<br />' % name
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
                    cron_data_nof += '%s<br />' % name
                kinohod_cities.append(id)
        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s<br />' % (datetime.datetime.now().date(), start_time, end_time, 'Импорт городов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cities.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data)

    cron_success('json', source.dump, 'cities', 'Города')

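# City matching above keys on low(del_separator(...)) slugs (project helpers
# imported at module level). A rough standalone equivalent for checking match
# keys offline, assuming unicode input; slugify_name is a hypothetical stand-in,
# not the project's implementation (uses the module-level re import).
def slugify_name(name):
    """Lowercase and strip all non-alphanumeric characters from a unicode name."""
    return re.sub(r'[\W_]+', '', name.lower(), flags=re.UNICODE)
# Usage sketch: slugify_name(u'Нижний Новгород') -> u'нижнийновгород'
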
def get_kinomagnat_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    data_nof_hall = ''
    data_nof_cinema = ''
    noffilms = []
    nofhalls = []

    city_name = 'Киев'
    cinema_name = 'Магнат'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://www.kinomagnat.com.ua/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')
    halls = get_source_data(source, 'hall', 'dict')

    city = City.objects.get(name__name=city_name, name__status=1)

    try:
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    except Cinema.DoesNotExist:
        cinema = None
        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city.kid)

    if cinema:
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })

        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })

        cinema_kid = cinema.code
        city_kid = city.kid
        today = datetime.date.today()

        url = '%sseans.html?device=iphone' % source.url
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            div = data.find('div', {'class': 'contentpaneopen'})
            for table in div.findAll('table'):
                try:
                    day, month = table.find_all_previous("p", limit=2)[1].text.strip().split()
                except ValueError:
                    try:
                        day, month = table.find_all_previous("p", limit=3)[2].text.strip().split()
                    except ValueError:
                        day, month = table.find_all_previous("p", limit=4)[3].text.strip().split()
                month = get_month_ua(low(month.encode('utf-8')))
                date_sch = datetime.date(today.year, month, int(day))

                hall_name = table.findAll('tr', limit=1)[0].text.strip().encode('utf-8')
                hall_name_slug = low(del_separator(hall_name))

                if hall_name_slug not in nofhalls:
                    hall_obj = halls.get(hall_name_slug)
                    if not hall_obj:
                        halls_obj = Hall.objects.filter(name__name=hall_name_slug, cinema=cinema_obj.cinema).distinct('pk')
                        if halls_obj.count() == 1:
                            hall_kid = halls_obj[0].kid
                            hall_obj = SourceHalls.objects.create(
                                source_id=hall_name_slug,
                                source_obj=source,
                                cinema=cinema_obj,
                                name=hall_name,
                                kid=hall_kid,
                            )
                            halls[hall_name_slug] = hall_obj
                        else:
                            id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name, hall_name_slug)
                            id = id.replace(' ', '')
                            data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (city_name, city_kid, cinema_name, cinema_kid, hall_name, hall_name_slug, id)
                            nofhalls.append(hall_name_slug)

                    if hall_obj:
                        for ind, tr in enumerate(table.findAll('tr')):
                            if ind != 0:
                                showtime, film_data = tr.findAll('td', limit=2)
                                hour, minute = showtime.text.strip().encode('utf-8').split(':')
                                dtime = datetime.datetime(date_sch.year, date_sch.month, date_sch.day, int(hour), int(minute))
                                a = film_data.find('a')
                                film_id = a.get('href').encode('utf-8')
                                full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                                film_name = a.text.strip().encode('utf-8')
                                film_slug = low(del_separator(film_name))
                                if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                    obj = films.get(film_id)
                                    next_step = checking_obj(obj)
                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(film_slug, None, {}, {}, source=source)
                                        objt = None
                                        if kid:
                                            create_new, objt = unique_func(fdict, kid, obj)
                                            if create_new:
                                                new = create_sfilm(film_id, kid, source, film_name)
                                                films[film_id] = new
                                                if not fdict.get(kid):
                                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                                fdict[kid]['script_rel'].append(new)
                                        elif not obj:
                                            data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                                            noffilms.append(film_id)
                                        if objt:
                                            sch_id = '%s%s%s' % (dtime, hall_obj.id, film_id)
                                            sch_id = sch_id.replace(' ', '').decode('utf-8')
                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id=sch_id,
                                                    source_obj=source,
                                                    film=objt,
                                                    cinema=cinema_obj,
                                                    hall=hall_obj.kid,
                                                    dtime=dtime,
                                                )
                                                schedules.append(sch_id)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_hall)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')

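# get_month_ua() above resolves a lowercase Ukrainian month name to its number.
# A minimal sketch of such a lookup table, assuming the genitive month forms
# that typically appear in schedule text ('5 грудня'); the project's own mapping
# may differ.
UA_MONTH_NUMBERS = {
    'січня': 1, 'лютого': 2, 'березня': 3, 'квітня': 4,
    'травня': 5, 'червня': 6, 'липня': 7, 'серпня': 8,
    'вересня': 9, 'жовтня': 10, 'листопада': 11, 'грудня': 12,
}
# Usage sketch: UA_MONTH_NUMBERS.get(low(month.encode('utf-8'))) -> 12 for 'грудня'.
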
def get_arsenalclub_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []

    city_name = 'Нефтекамск'
    cinema_name = 'Арсенал'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://arsenal-club.com/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    today = datetime.datetime.now().date()
    future = today + datetime.timedelta(days=6)
    delta = future - today

    for d in range(delta.days + 1):
        date = today + datetime.timedelta(days=d)
        flag = False
        url = '%skino/?rasdel=kino&day=%s#daybox' % (source.url, date.strftime('%d.%m'))
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())  # , from_encoding="utf-8"
            for table in data.findAll('table', width="100%", cellpadding="3", cellspacing="1", bgcolor="#393939"):
                trs = table.findAll('tr', bgcolor="#292929")
                if len(trs) == 0:
                    flag = True
                else:
                    for tr in trs:
                        times, film, price = tr.findAll('td')
                        full_url = film.a.get('href').encode('utf-8') if film.a and film.a.get('href') else None
                        if full_url:
                            film_name = film.a.text.encode('utf-8').strip()
                        else:
                            film_name = film.text.encode('utf-8').strip()
                        film_slug = del_screen_type(low(del_separator(film_name)))
                        if full_url:
                            film_id = full_url.replace('http://www.kinopoisk.ru/film/', '').encode('utf-8')
                        else:
                            film_id = film_slug
                        if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url, source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    hours, minutes = times.string.split(':')
                                    dtime = datetime.datetime(date.year, date.month, date.day, int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        if flag:
            break

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')

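# The schedule parsers in this module dedupe sessions on a composite source_id
# built from datetime + slugs + film id with spaces removed (see sch_id above).
# A minimal sketch of that key construction under the same inputs;
# make_schedule_key is a hypothetical helper, not used by the parsers themselves.
def make_schedule_key(dtime, cinema_slug, city_slug, film_id):
    """Reproduce the sch_id format: concatenated fields, spaces stripped."""
    return ('%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)).replace(' ', '')
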
def get_okinoua_distributors(request):
    form = OkinoUploadForm()
    if request.POST:
        form = OkinoUploadForm(request.POST, request.FILES)
        if form.is_valid():
            source = ImportSources.objects.get(url='http://www.okino.ua/')

            with open('%s/dump_%s_nof_film.xml' % (settings.NOF_DUMP_PATH, source.dump), 'r') as f:
                xml_data = BeautifulSoup(f.read(), from_encoding="utf-8")

            ignored = get_ignored_films()
            films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]

            today = datetime.date.today()
            films_dict = get_source_data(source, 'film', 'dict')

            releases = SourceReleases.objects.select_related('film').filter(film__source_obj=source, release__gte=today)
            releases_dict = {}
            for i in releases:
                releases_dict[i.film.source_id] = i

            data_nof_films = ''

            data = request.FILES['file'].read()
            html_data = BeautifulSoup(data, from_encoding="utf-8")

            main = html_data.find('div', {'class': 'release_list'})
            year = datetime.date.today().year

            first_h3 = main.findAll('h3', limit=1)[0]
            for div in first_h3.find_next_siblings():
                film_tag = div.find('p', {'class': 'name'})
                flag = False
                if film_tag:
                    flag = True
                    film_tag = film_tag.a
                    film_name = film_tag.string.encode('utf-8')
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\d+\/$', full_url)[0].replace('/', '').encode('utf-8')
                    film_slug = low(del_separator(film_name))
                    film_year = div.find('span', {'class': 'y'}).string.encode('utf-8').replace('(', '').replace(')', '')
                    full_url = 'http://www.okino.ua%s' % full_url

                    release_day = int(div.find('span', {'class': 'day'}).string)
                    release_month = div.find('span', {'class': 'month'}).string.encode('utf-8')
                    release_month = get_month(release_month)
                    release_date = datetime.date(year, int(release_month), release_day)

                    film_obj = films_dict.get(film_id)
                    if not film_obj:
                        kid, info = film_identification(film_slug, None, {}, {}, year=film_year, source=source)
                        if kid:
                            film_obj = SourceFilms.objects.create(
                                source_id=film_id,
                                source_obj=source,
                                name=film_name,
                                kid=kid,
                                year=film_year,
                            )
                        else:
                            temp_film_slug = film_slug.decode('utf-8')
                            if temp_film_slug not in films_slugs and temp_film_slug not in ignored:
                                films_slugs.append(film_slug.decode('utf-8'))
                                data_nof_films += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)

                    if film_obj:
                        for p in div.findAll('p'):
                            if p.string:
                                text = p.string.encode('utf-8')
                                if 'Дистрибьютор:' in text:
                                    distr = text.replace('Дистрибьютор: ', '').decode('utf-8')
                                    release_obj = releases_dict.get(film_id)
                                    if release_obj:
                                        if release_obj.release != release_date or release_obj.distributor != distr:
                                            release_obj.release = release_date
                                            release_obj.distributor = distr
                                            release_obj.save()
                                    else:
                                        release_obj = SourceReleases.objects.create(
                                            source_obj=source,
                                            film=film_obj,
                                            release=release_date,
                                            distributor=distr,
                                        )
                                        releases_dict[film_id] = release_obj

                if div.string:
                    year = int(re.findall(r'\d+$', div.string.encode('utf-8'))[0])

            xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
            xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)
            create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, xml_data)

            return HttpResponseRedirect(reverse('admin_source_releases_show'))

    return render_to_response('release_parser/okinoua_upload.html', {'form': form}, context_instance=RequestContext(request))

def get_zapad24ru():
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.now()
    next_month = datetime.date.today() + datetime.timedelta(days=40)

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []

    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  # , from_encoding="utf-8"
        div = data.find('div', align="left")
        for ind, table in enumerate(div.findAll('table', border="0", cellpadding="0", cellspacing="0", width="100%")):
            cinema_tag = table.find('strong').string.encode('utf-8')
            cinema_name = re.findall(r'\".+\"', cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')

            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace('(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')

            city_obj = cities_dict.get(city_id)
            if not city_obj:
                city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)

            if city_obj:
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid)
                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                        cinema_kid = cinema_identification(cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city_obj.city.kid)

                    if cinema_obj:
                        film_table = table.find('table')
                        date_from = None
                        date_to = None
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                film_name = tr.find('b').string.encode('utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find('span').string.encode('utf-8').strip()
                                    else:
                                        film_name = f.string.encode('utf-8').strip()
                                        film_name = re.findall(r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace('«', '').replace('»', '').strip()
                                        film_slug = low(del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')
                                if showdate and film_name:
                                    try:
                                        date_from, date_to = showdate.split('-')
                                        date_from_day, date_from_month = date_from.strip().split('.')
                                        date_to_day, date_to_month = date_to.strip().split('.')
                                    except ValueError:
                                        date_from, date_to = showdate.split(' – ')
                                        date_from_day, date_from_month = date_from.strip().split()
                                        date_from_month = get_month(date_from_month)
                                        date_to_day, date_to_month = date_to.strip().split()
                                        date_to_month = get_month(date_to_month)
                                    date_from = datetime.date(today.year, int(date_from_month), int(date_from_day))
                                    date_to = datetime.date(today.year, int(date_to_month), int(date_to_day))

                            full_url = tr.find('a').get('href').encode('utf-8')

                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_slug, None, {}, {}, source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(film_name, film_slug, None, None, film_id.encode('utf-8'), info, full_url, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(req_film.read())  # , from_encoding="utf-8"
                                            td = data_film.find('td', {'class': 'news'}).div.text.encode('utf-8')
                                            showtime = []
                                            if ind == 0:
                                                showtime = re.findall(r'\d+\:\d+\s\s?', td)
                                            else:
                                                if date_from and date_to:
                                                    if date_to < next_month:
                                                        showtimes = re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td)
                                                        times = []
                                                        for t in showtimes:
                                                            t = t.replace('Начало сеансов:', '').split(',')
                                                            times = [i.strip() for i in t if i.strip()]
                                                        delta = date_to - date_from
                                                        for day in range(delta.days + 1):
                                                            d = date_from + datetime.timedelta(days=day)
                                                            for t in times:
                                                                hours, minutes = t.split('-')
                                                                dtime = datetime.datetime(d.year, d.month, d.day, int(hours), int(minutes))
                                                                showtime.append(dtime)
                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip().split(':')
                                                    dtime = datetime.datetime(today.year, today.month, today.day, int(hours), int(minutes))
                                                else:
                                                    dtime = t
                                                sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')

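# zapad24.ru film pages list showtimes as 'Начало сеансов: 18-30, 20-45'. A
# minimal sketch of pulling (hour, minute) pairs out of such text with the same
# regex/split logic as above (module-level re import); extract_showtimes is a
# hypothetical helper.
def extract_showtimes(text):
    """Return [(hour, minute), ...] parsed from 'Начало сеансов: ...' fragments."""
    pairs = []
    for chunk in re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', text):
        for t in chunk.replace('Начало сеансов:', '').split(','):
            t = t.strip()
            if t:
                hours, minutes = t.split('-')
                pairs.append((int(hours), int(minutes)))
    return pairs
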
def get_cinema5_schedules():
    data_nof_cinema = ''
    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    source = ImportSources.objects.get(url='http://cinema5.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    data = [
        {'city': 'Нижнекамск', 'url': '%snk' % source.url},
        {'city': 'Оренбург', 'url': '%soren' % source.url},
        {'city': 'Саратов', 'url': '%ssaratov' % source.url},
        {'city': 'Уфа', 'url': '%sufa' % source.url},
        {'city': 'Чебоксары', 'url': '%scheby' % source.url},
    ]

    params = ['today', 'tomorrow', '+2days']

    cinema_name = 'Синема 5'
    cinema_slug = low(del_separator(cinema_name))

    for i in data:
        city_slug = low(del_separator(i['city']))
        city = City.objects.get(name__name=i['city'], name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': i['city'],
            })

        cinema = None
        try:
            cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
        except Cinema.DoesNotExist:
            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, i['city'], city_obj.city.kid)

        if cinema:
            cinema_id = '%s_%s' % (cinema_slug, city_slug)
            cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
                source_id=cinema_id,
                source_obj=source,
                defaults={
                    'source_id': cinema_id,
                    'source_obj': source,
                    'city': city_obj,
                    'cinema': cinema,
                    'name': cinema_name,
                })

            for param in params:
                url = '%s?date=%s' % (i['url'], param)
                req = urllib.urlopen(url)
                if req.getcode() == 200:
                    page_data = BeautifulSoup(req.read())
                    divs = page_data.find('div', {'class': 'content clearfix'})
                    showdate = divs.find('h1')
                    if showdate:
                        showdate = showdate.string.encode('utf-8')
                        day, month, year = showdate.replace('Расписание на ', '').strip().split('.')
                        for div in divs.findAll('div', {'class': 'show-wrapper'}):
                            film_name = div.find('div', {'class': 'title'}).string.encode('utf-8')
                            film_slug = low(del_separator(del_screen_type(film_name)))
                            film_id = film_slug
                            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_slug, None, {}, {}, source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id.decode('utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        for span in div.findAll('span', {'class': 'time'}):
                                            hours, minutes = span.string.strip().split(':')
                                            dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))
                                            sch_id = '%s%s%s%s' % (dtime, cinema_id, city_slug, film_id)
                                            sch_id = sch_id.replace(' ', '').decode('utf-8')
                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id=sch_id,
                                                    source_obj=source,
                                                    film=objt,
                                                    cinema=cinema_obj,
                                                    dtime=dtime,
                                                )
                                                schedules.append(sch_id)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')