def imdb_film_ident():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    films = Films.objects.filter(kid=None)
    films_ids = [i.imdb_id for i in films]
    exist_films = Film.objects.using('afisha').filter(idalldvd__in=films_ids)
    exist_ids = {}
    for i in exist_films:
        exist_ids[i.idalldvd] = i.id
    data_nof_film = ''
    for i in films:
        name = None
        for j in i.name.filter(status=1, language__id=2):
            name = j.name.encode('utf-8')
        slug = low(del_separator(name))
        kid = exist_ids.get(long(i.imdb_id))
        if kid:
            i.kid = kid
            i.save()
        else:
            full_url = '%stitle/tt%s/' % (source.url, i.imdb_id)
            data_nof_film += xml_noffilm(name, slug, None, None, i.imdb_id,
                                         'Фильм не найден',
                                         full_url.encode('utf-8'), source.id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films_ident', 'Идентификация')
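
# For reference, a minimal sketch of the record xml_noffilm() is assumed to
# emit (the real helper lives elsewhere in this codebase). The <film> element
# name and the slug_ru attribute are confirmed by the dump-parsing code below
# (xml_data.findAll('film') / i.get('slug_ru') in get_kinoteatrua_films_and_persons
# and get_okinoua_distributors); the remaining attribute names are guesses.
def _xml_noffilm_sketch(name, slug, name_alt, slug_alt, film_id, info, url, source_id):
    # one record per unidentified film; create_dump_file() later wraps the
    # concatenated records in a <data>...</data> root element
    return ('<film name="%s" slug_ru="%s" name_alt="%s" slug_alt="%s" '
            'code="%s" info="%s" url="%s" source="%s"></film>' % (
                name, slug, name_alt, slug_alt, film_id, info, url, source_id))
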
def nowru_ident():
    source = ImportSources.objects.get(url='http://www.now.ru/')
    ignored = get_ignored_films()
    data_nof_film = ''
    nowru_data = Nowru.objects.filter(kid=None)
    for i in nowru_data:
        name_ru_slug = low(del_separator(i.name_ru.encode('utf-8')))
        if name_ru_slug.decode('utf-8') not in ignored:
            name_en_slug = low(del_separator(i.name_en.encode('utf-8')))
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {},
                                            year=i.year, source=source)
            if kid:
                i.kid = kid
                i.save()
            else:
                if 'slug="%s"' % name_ru_slug not in data_nof_film:
                    name_ru = i.name_ru.encode('utf-8')
                    name_en = i.name_en.encode('utf-8')
                    data_nof_film += xml_noffilm(name_ru, name_ru_slug, name_en,
                                                 name_en_slug, i.nowru_id, info,
                                                 None, source.id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
def raspishi_relations():
    source = ImportSources.objects.get(url='http://распиши.рф/')
    ignored = get_ignored_films()
    data_nof_film = ''
    domain = u'распиши.рф'
    url = 'http://%s/getfilmxml.php' % domain.encode('idna')
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        films_rid = list(
            RaspishiRelations.objects.exclude(kid=0).values_list('rid', flat=True))
        xml_data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for i in xml_data.findAll('movie'):
            id = int(i['id'])
            if id not in films_rid:
                name_ru = i.find('name').text.encode('utf-8')
                name_en = i.find('nameeng').text.encode('utf-8')
                name_ru = re.sub(r'\(.*?\)', '', name_ru).strip()
                name_en = re.sub(r'\(.*?\)', '', name_en).strip()
                name_slug = low(del_separator(del_screen_type(name_ru)))
                name_en_slug = low(del_separator(del_screen_type(name_en)))
                if name_slug.decode('utf-8') not in ignored:
                    try:
                        kid, info = film_identification(name_slug, None, {}, {},
                                                        source=source)
                        if kid:
                            created = RaspishiRelations.objects.create(
                                rid=id,
                                kid=kid,
                                name_ru=name_ru,
                                name_en=name_en,
                            )
                        else:
                            data_nof_film += xml_noffilm(name_ru, name_slug,
                                                         name_en, name_en_slug,
                                                         id, info, None, source.id)
                    except db.backend.Database._mysql.OperationalError:
                        pass
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Укр. сеансы')
def film_kinometro_ident():
    ''' Identification of kinometro films '''
    source = ImportSources.objects.get(url='http://www.kinometro.ru/')
    xml_nof_data = ''
    distr_nof_data = ''
    nof_distributors = []
    noffilms = []
    sfilm_clean_kinometro()
    ignored = get_ignored_films()
    films = {}
    source_films = ReleasesRelations.objects.select_related('release').all()
    for i in source_films:
        films[i.release.film_id] = i
    fdict = get_all_kinometro_films(source_films)
    exe_releases = list(ReleasesRelations.objects.all().values_list('release', flat=True))
    releases = Releases.objects.exclude(id__in=exe_releases)
    for release in releases:
        # fetch film names and normalize them
        film_name_ru = release.name_ru.encode('utf-8')
        film_name_en = release.name_en.encode('utf-8')
        slug_name_ru = low(del_separator(del_screen_type(film_name_ru)))
        slug_name_en = low(del_separator(del_screen_type(film_name_en)))
        if not slug_name_ru:
            slug_name_ru = slug_name_en
            film_name_ru = film_name_en
        else:
            if not slug_name_en:
                slug_name_en = slug_name_ru
                film_name_en = film_name_ru
        # fetch distributor data, normalize it, and resolve the distributor
        # objects used for film identification
        dlist = []
        distr_dict = [{
            'name': release.distributor1,
            'id': release.distributor1_id
        }, {
            'name': release.distributor2,
            'id': release.distributor2_id
        }]
        distr_temp_data = ''
        distr_data = ''
        for i in distr_dict:
            if i['name']:
                distr_name = i['name'].encode('utf-8').replace('&amp;', '&')
                distr_alt = i['id'].encode('utf-8').replace('&amp;', '&') if i['id'] else ''
                distr_slug = low(del_separator(distr_name))
                distr_temp_data += '<distributor value="%s" code="%s" kid=""></distributor>' % (
                    distr_name, distr_alt)
                distr, status = distributor_identification(distr_name, distr_slug)
                if distr:
                    dlist.append(distr.kid)
                    kid = distr.kid if distr.kid else 0
                    distr_data += '<distributor value="%s" code="%s" kid="%s"></distributor>' % (
                        distr_name, distr_alt, kid)
                else:
                    if distr_slug.decode('utf-8') not in nof_distributors:
                        distr_nof_data += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (
                            distr_name, distr_slug, distr_alt)
                        nof_distributors.append(distr_slug.decode('utf-8'))
        if not distr_data:
            distr_data = distr_temp_data
        ru_ignore = False
        if slug_name_ru.decode('utf-8') in ignored:
            ru_ignore = True
        en_ignore = False
        if slug_name_en.decode('utf-8') in ignored:
            en_ignore = True
        if not ru_ignore and not en_ignore:
            nof_flag = True
            if dlist:
                # choose the distributor filters for the lookup
                d1, d2 = (dlist[0], dlist[1]) if len(dlist) > 1 else (dlist[0], None)
                film_id = release.film_id
                if film_id not in noffilms:
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        skip = False
                        if obj:
                            kid = obj.release.film_kid
                        else:
                            try:
                                kid, info = film_identification(slug_name_ru, slug_name_en,
                                                                d1, d2, source=source)
                            except db.backend.Database._mysql.OperationalError:
                                skip = True
                        if not skip:
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj, kinometro=True)
                                if create_new:
                                    new = ReleasesRelations.objects.create(
                                        film_kid=kid,
                                        release=release,
                                        distributor_kid=dlist[0],
                                        rel_dtime=datetime.datetime.now(),
                                    )
                                    films[film_id] = new
                            elif not obj:
                                xml_nof_data += xml_noffilm(
                                    film_name_ru, slug_name_ru, film_name_en,
                                    slug_name_en, release.film_id, info,
                                    release.url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
            else:
                info = 'Нет такого дистрибьютора'
                xml_nof_data += xml_noffilm(film_name_ru, slug_name_ru,
                                            film_name_en, slug_name_en,
                                            release.film_id, info,
                                            release.url.encode('utf-8'), source.id)
    create_dump_file('%s_nof_distributor' % source.dump,
                     settings.NOF_DUMP_PATH, '<data>%s</data>' % distr_nof_data)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % xml_nof_data)
    cron_success('html', source.dump, 'releases', 'Идентификация фильмов и дистр.')
def get_kinobklass_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://kino-bklass.ru/')
    sfilm_clean(source)
    city_name = 'Серпухов'
    cinema_name = 'Кинотеатр в ТРК "Б-Класс"'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    today = datetime.datetime.now().date()
    to = today + datetime.timedelta(days=6)
    delta = to - today
    for day in range(delta.days + 1):
        d = today + datetime.timedelta(days=day)
        url = '%s?date=%s' % (source.url, d.strftime("%Y%m%d"))
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            table = data.findAll('table', id='rasp', limit=1)[0]
            for td in table.findAll('td', colspan='10'):
                full_url = td.a.get('href')
                film_id = full_url.replace('http://kino-bklass.ru/films/', '').replace('/', '').encode('utf-8')
                film_name = td.a.h3.string.strip().split(' ')[0].encode('utf-8')
                film_slug = del_screen_type(low(del_separator(film_name)))
                if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                    obj = films.get(film_id.decode('utf-8'))
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None, {}, {},
                                                            source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source, film_name)
                                films[film_id.decode('utf-8')] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                         film_id, info,
                                                         full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id)
                        if objt:
                            tr = td.find_next('tr')
                            times = []
                            for time_tag in tr.findAll('td'):
                                t = None
                                if time_tag.string:
                                    t = time_tag.string.strip().encode('utf-8')
                                if time_tag.b:
                                    t = time_tag.b.string.strip().encode('utf-8')
                                if t:
                                    try:
                                        hours, minutes = t.split(':')
                                    except ValueError:
                                        try:
                                            hours, minutes = t.split('-')
                                        except ValueError:
                                            hours, minutes = t.split('^')
                                    if hours == '24':
                                        hours, minutes = (23, 59)
                                    year, month, day = str(d).split('-')
                                    dtime = datetime.datetime(int(year), int(month), int(day),
                                                              int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        time.sleep(3.0)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
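
# A minimal, self-contained illustration (not part of the import pipeline) of
# the schedule dedup key built above and in the other schedule parsers:
# str(datetime) + cinema_slug + city_slug + film_id with all spaces stripped.
# The slug/id values below are illustrative only.
def _sch_id_example():
    import datetime
    dtime = datetime.datetime(2014, 5, 1, 12, 30)
    sch_id = '%s%s%s%s' % (dtime, 'bklass', 'serpuhov', '123')
    return sch_id.replace(' ', '')  # '2014-05-0112:30:00bklassserpuhov123'
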
def cinemate_cc_soon():
    '''
    login = cinemate_cc_login()
    if login['error']:
        return HttpResponse(str(login['error']))
    else:
        opener = login['opener']
        source = login['source']
    '''
    source = ImportSources.objects.get(url='http://cinemate.cc/')
    opener = give_me_cookie()
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[int(i.source_id)] = i
    fdict = get_all_source_films(source, source_films)
    send_msg = False
    for main_url in ('%smovies/soon' % source.url, '%smovies/cinema' % source.url):
        req = opener.open(urllib2.Request(main_url))
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        nav = data.find('div', {'class': "navigation"})
        nav_link = nav.findAll('a')[-1]
        last_page = int(nav_link.get('href').split('?page=')[-1])
        if last_page > 10:
            last_page = 10
        film_list = get_cinemate_cc_film(data, source, ignored, noffilms)
        for page in xrange(2, (last_page + 1)):
            time.sleep(random.uniform(1.0, 2.5))
            url = '%s?page=%s' % (main_url, page)
            try:
                req = opener.open(urllib2.Request(url))
                data = BeautifulSoup(req.read(), from_encoding="utf-8")
                film_list += get_cinemate_cc_film(data, source, ignored, noffilms)
            except urllib2.HTTPError:
                pass
        for i in film_list:
            obj = films.get(i['id'])
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    kid, info = film_identification(i['slug'], None, {}, {},
                                                    year=i['year'], source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        objt = create_sfilm(i['id'], kid, source, i['name'],
                                            year=i['year'],
                                            txt=datetime.datetime.now().date(),
                                            extra='new')
                        films[i['id']] = objt
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(objt)
                        send_msg = True
                elif not obj:
                    data_nof_film += xml_noffilm(i['name'], i['slug'], None, None,
                                                 i['id'], info,
                                                 i['url'].encode('utf-8'), source.id)
                    noffilms.append(i['id'])
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы в сети')
    if send_msg:
        current_site = DjangoSite.objects.get(domain='kinoinfo.ru')
        msg_from = Profile.objects.get(user__last_name='SYSTEM')
        msg_to = Profile.objects.get(accounts__login='******')  # [email protected]
        msg = 'В сети появились новые фильмы <a href="http://kinoinfo.ru/torrents/listing/%s/" target="_blank">http://kinoinfo.ru/torrents/listing/%s/</a>' % (
            source.id, source.id)
        try:
            dialog_exist = DialogMessages.objects.filter(
                readers__user=msg_to,
                readers__message__autor=msg_from).order_by('-id')[0]
        except IndexError:
            dialog_exist = None
        reader_type = '1'
        msg_obj = News.objects.create(
            title='Сообщение',
            text=msg,
            autor=msg_from,
            site=current_site,
            subdomain='0',
            reader_type='1',
        )
        reader = NewsReaders.objects.create(user=msg_to, status='0', message=msg_obj)
        if dialog_exist:
            dialog_exist.readers.add(reader)
        else:
            dialog_obj = DialogMessages()
            dialog_obj.save()
            dialog_obj.readers.add(reader)
def get_kinoteatr_data(opener, date, city_obj):
    # NOTE: relies on names bound in the enclosing scope (source, films,
    # fdict, cinemas_dict, nofcinemas, noffilms, films_slugs, schedules,
    # ignored, city_slug).
    nof_films = ''
    nof_cinemas = ''
    url = '%sru/main/bill/order/cinemas/date/%s.phtml' % (source.url, date['str'])
    req = opener.open(urllib2.Request(url))
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        main = data.find('div', id='news_page')
        if main:
            if main.find('center', {'class': 'xErr'}):
                return nof_films, nof_cinemas, 'error'
            for cinema_tag in main.findAll('span', id='afishaKtName'):
                cinema_name_block = cinema_tag.findAll('a', limit=1)[0]
                cinema_name = cinema_name_block.text.encode('utf-8').replace('Кинотеатр', '')
                cinema_slug = low(del_separator(del_screen_type(cinema_name)))
                cinema_name = cinema_name.replace('"', "'").replace('&amp;', '&').strip()
                cinema_id = cinema_name_block.get('href').replace('.phtml', '')
                if 'cinema_id' in cinema_id:
                    cinema_id = cinema_id.replace(
                        'http://kino-teatr.ua/ru/main/cinema/cinema_id/', '').encode('utf-8')
                else:
                    cinema_id = re.findall(r'\d+$', cinema_id)[0]
                if cinema_id not in nofcinemas:
                    cinema_obj = cinemas_dict.get(str(cinema_id))
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id
                        }
                        cinema_kid = cinema_identification(cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[str(cinema_id)] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            try:
                                name_city = city_obj.name.encode('utf-8')
                            except UnicodeDecodeError:
                                name_city = city_obj.name
                            nof_cinemas += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug, name_city, city_obj.city.kid)
                            nofcinemas.append(cinema_id)
                    if cinema_obj:
                        films_block = cinema_tag.find_next_sibling('div')
                        for film_block in films_block.findAll('div', id='afishaItem'):
                            film_name = film_block.find('div', {'class': 'filmName'})
                            full_url = film_name.a.get('href').encode('utf-8')
                            if film_name.a.text:
                                film_name = film_name.a.text.encode('utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = full_url.replace('http://kino-teatr.ua/film/', '').replace('.phtml', '').encode('utf-8')
                                if film_slug.decode('utf-8') not in ignored and film_id not in noffilms:
                                    obj = films.get(film_id)
                                    next_step = True if obj and obj.rel_ignore else False
                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(film_slug, None, {}, {},
                                                                            source=source)
                                        if not obj:
                                            if kid:
                                                # fetch the Ukrainian film page for name/synopsis
                                                uk_url = '%suk/film/%s' % (source.url, film_id)
                                                uk_req = opener.open(urllib2.Request(uk_url))
                                                if uk_req.getcode() == 200:
                                                    uk_data = BeautifulSoup(uk_req.read().decode('utf-8'),
                                                                            from_encoding="utf-8")
                                                    uk_name = uk_data.find('div', {'class': 'myriadFilm'}).text.encode('utf-8')
                                                    uk_text = uk_data.find('div', itemprop='description')
                                                    uk_text_data = uk_text.findAll('p', limit=1)
                                                    if uk_text_data:
                                                        uk_text = uk_text_data[0].text.encode('utf-8')
                                                    else:
                                                        uk_text = uk_text.text.encode('utf-8').strip()
                                                    uk_text = uk_text.replace('редактирование синопсиса', '').strip()
                                                    obj = create_sfilm(film_id, kid, source, uk_name, txt=uk_text)
                                                    films[film_id] = obj
                                                    if not fdict.get(kid):
                                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                                    fdict[kid]['script_rel'].append(obj)
                                            else:
                                                if film_slug.decode('utf-8') not in films_slugs:
                                                    nof_films += xml_noffilm(film_name, film_slug, None, None,
                                                                             film_id, info, full_url, source.id)
                                                    noffilms.append(film_id)
                                        if obj:
                                            shows = film_block.find('div', {'class': 'filmShows'})
                                            for times in shows.findAll('a', {'class': 'time'}):
                                                try:
                                                    hours, minutes = times.text.split(':')
                                                except AttributeError:
                                                    times.find('sup').extract()
                                                    hours, minutes = times.text.split(':')
                                                dtime = datetime.datetime(
                                                    date['obj'].year, date['obj'].month,
                                                    date['obj'].day, int(hours), int(minutes))
                                                sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                                       city_slug, film_id)
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=obj,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)
    return nof_films, nof_cinemas, ''
def page_parser(city_name, cinema_name, source):
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url=source)
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    main_url = '%ssessions/' % source.url
    today = datetime.date.today()
    next_week = today + datetime.timedelta(days=6)
    delta = next_week - today
    for day in range(delta.days + 1):
        date_obj = today + datetime.timedelta(days=day)
        url = '%s%s' % (main_url, date_obj)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id="section-session")
            if main:
                main = main.find('table')
                for tr in main.findAll('tr'):
                    showtime, film = tr.findAll('td', limit=2)
                    hours, minutes = showtime.string.split(':')
                    film_a = film.findAll('a')
                    if film_a:
                        film_a = film_a[1] if len(film_a) > 1 else film_a[0]
                        full_url = film_a.get('href')
                        film_id = full_url.replace('%sfilms/' % source.url, '').replace('/', '').encode('utf-8')
                        film_name = del_screen_type(film_a.get('title').encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))
                        if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {},
                                                                    source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                                 film_id, info,
                                                                 full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    dtime = datetime.datetime(date_obj.year, date_obj.month,
                                                              date_obj.day, int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
    return data_nof_film
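
# page_parser() returns its nof-film XML fragment instead of writing the dump
# itself, so a caller is expected to aggregate and persist it. A hedged usage
# sketch following the pattern the other parsers use; the URL, city, and
# cinema values here are illustrative only, not real configuration.
def _page_parser_usage_sketch():
    source_url = 'http://example-cinema-site.ru/'  # hypothetical source URL
    nof = page_parser('Москва', 'Иллюзион', source_url)
    source = ImportSources.objects.get(url=source_url)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % nof)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
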
def get_vkinocomua_films_and_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    cinemas_data = SourceCinemas.objects.select_related('city').filter(source_obj=source)
    cinemas = {}
    for ind, i in enumerate(cinemas_data):
        url = '%scinema/%s/%s/showtimes' % (source.url, i.city.source_id, i.source_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='cinema-showtimes')
            if main:
                for content in main.findAll('div', {'class': 'content'}):
                    film_tag = content.find('a', {'class': 'navi'})
                    film_name = film_tag.string.encode('utf-8').strip()
                    film_slug = low(del_separator(film_name))
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\/\d+\/', full_url)
                    if film_id:
                        film_id = film_id[0].replace('/', '').encode('utf-8')
                    else:
                        film_id = film_slug
                    full_url = '%s%s' % (source.url, full_url.lstrip('/'))
                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id.decode('utf-8'))
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {},
                                                                source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                             film_id, info,
                                                             url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                            if objt:
                                for div in content.findAll('div', {'class': 'date'}):
                                    year, month, day = div['data-date'].split('-')
                                    show = div.find_next_sibling("ul")
                                    for li in show.findAll('li'):
                                        if li.a:
                                            extra = li.a.get('href')
                                            hours, minutes = li.a.text.strip().split(':')
                                        else:
                                            extra = None
                                            hours, minutes = li.text.strip().split(':')
                                        # sale = True if extra else False
                                        dtime = datetime.datetime(int(year), int(month), int(day),
                                                                  int(hours), int(minutes))
                                        sch_id = u'%s%s%s%s' % (dtime, i.source_id, i.city_id,
                                                                film_id.decode('utf-8'))
                                        sch_id = sch_id.replace(' ', '')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=i,
                                                dtime=dtime,
                                                extra=extra,
                                            )
                                            schedules.append(sch_id)
        if ind % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_cinemaarthall_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Норильск'
    cinema_name = 'Синема-АРТ-Холл'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://cinemaarthall.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    dates = []
    url = '%spage/kino/films/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        show_days = data.find('div', id='datachek')
        for a in show_days.findAll('a'):
            day = a.get('href').replace('/page/kino/films/&date=', '')
            dates.append(day)
    for d in dates:
        url = '%spage/kino/films/&date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            for div in data.findAll('div', {'class': 'media-block'}):
                film_name = div.find('h3')
                if film_name:
                    film_name = film_name.string.encode('utf-8')
                    film_id = div.findAll('a', limit=1)[0].get('href').replace('/', '').encode('utf-8')
                    film_slug = del_screen_type(low(del_separator(film_name)))
                    full_url = '%spage/kino/films/%s' % (source.url, film_id)
                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {},
                                                                source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                             film_id, info,
                                                             full_url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                            if objt:
                                div_sess = div.find('div', {'class': 'filmr'})
                                for t in div_sess.findAll('span'):
                                    if t.string:
                                        t = t.string.split(',')[0]
                                        hours, minutes = t.split(':')
                                        day, month, year = d.split('.')
                                        dtime = datetime.datetime(int(year), int(month), int(day),
                                                                  int(hours), int(minutes))
                                        sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                        sch_id = sch_id.replace(' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_oreanda_and_spartak():
    ignored = get_ignored_films()
    city_name = 'Ялта'
    city_slug = low(del_separator(city_name))
    xdata = (
        {
            'url': 'http://yaltakino.com/Oreanda/',
            'eng': 'Oreanda',
            'ru': 'Ореанда'
        },
        {
            'url': 'http://yaltakino.com/Spartak/',
            'eng': 'Spartak',
            'ru': 'Спартак'
        },
    )
    for data in xdata:
        data_nof_film = ''
        noffilms = []
        source = ImportSources.objects.get(url=data['url'])
        sfilm_clean(source)
        films = {}
        source_films = SourceFilms.objects.filter(source_obj=source)
        for i in source_films:
            films[i.source_id] = i
        fdict = get_all_source_films(source, source_films)
        schedules = get_source_data(source, 'schedule', 'list')
        city = City.objects.get(name__name=city_name, name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })
        cinema_name = data['ru']
        cinema_eng = data['eng']
        cinema_slug = low(del_separator(cinema_name))
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })
        main_url = '%sschedule/' % source.url
        req = urllib.urlopen(main_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            main = data.find('td', {'class': 'contentplaceholder'})
            for div in main.findAll('div', {'class': 'scheduleDayCaption'}):
                sess_date, sess_day = div.text.split(' / ')
                day, month = sess_date.split()
                month = get_month(month.encode('utf-8'))
                year = datetime.datetime.now().year
                table = div.find_next('table')
                for tr in table.findAll('tr'):
                    if tr.find('td', {'class': 'scheduleTime'}):
                        hour, minute = tr.find('td', {'class': 'scheduleTime'}).text.split(':')
                        film = tr.find('a', {'class': 'scheduleLink'})
                        film_url = film.get('href')
                        full_url = 'http://yaltakino.com%s' % film_url
                        film_id = film_url.replace('/%s/?filmid=' % cinema_eng, '')
                        film_name = del_screen_type(film.text.encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))
                        if film_id.encode('utf-8') not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {},
                                                                    source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                                 film_id.encode('utf-8'), info,
                                                                 full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id.encode('utf-8'))
                                if objt:
                                    dtime = datetime.datetime(year, int(month), int(day),
                                                              int(hour), int(minute))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug,
                                                           film_id.encode('utf-8'))
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_film)
        cron_success('html', source.dump, 'schedules', 'Сеансы')
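
# get_month()/get_month_ua() are assumed to map a genitive Russian/Ukrainian
# month name to its number. A minimal sketch of the Russian variant used
# above (the real helper lives elsewhere in this codebase; this is only an
# illustration of the assumed contract):
def _get_month_sketch(name):
    months = {
        'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4,
        'мая': 5, 'июня': 6, 'июля': 7, 'августа': 8,
        'сентября': 9, 'октября': 10, 'ноября': 11, 'декабря': 12,
    }
    return months.get(name)
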
def get_rottentomatoes_films(everyday=True):

    def get_critic(block):
        critic = block.findAll('div', id="scoreStats", limit=1)
        if critic:
            critic = critic[0].findAll('div')
            average = critic[0].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            reviews = critic[1].findAll('span', limit=2)[1].text.strip()
            fresh = critic[2].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            rotten = critic[3].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            return '%s;%s;%s;%s' % (average.replace('/10', ''), reviews, fresh, rotten)
        else:
            return 'N/A;0;0;0'
        '''
        critic = block.findAll('p', {'class': 'critic_stats'}, limit=1)[0]
        average, reviews = critic.findAll('span', limit=2)
        try:
            fresh, rotten = reviews.next_sibling.next_sibling.encode('utf-8').strip().split(' | ')
        except AttributeError:
            return 'N/A;0;0;0'
        fresh = fresh.replace('Fresh:', '').strip()
        rotten = rotten.replace('Rotten:', '').strip()
        average = average.string.encode('utf-8').split('/')[0]
        reviews = reviews.string.encode('utf-8')
        return '%s;%s;%s;%s' % (average, reviews, fresh, rotten)
        '''

    source = ImportSources.objects.get(url='http://www.rottentomatoes.com/')
    sfilm_clean(source)
    noffilms = []
    data_nof_film = ''
    filter = {'source_obj': source}
    if everyday:
        today = datetime.datetime.today().date()
        day7 = today + datetime.timedelta(days=7)
        today = today - datetime.timedelta(days=30)
        filter['text__gte'] = today
        filter['text__lt'] = day7
    exists = get_source_data(source, 'film', 'list')
    films = {}
    source_films = SourceFilms.objects.filter(**filter)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    ignored = get_ignored_films()
    opener = urllib2.build_opener()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',
    }
    opener.addheaders = headers.items()
    updated = []
    for k, f in films.items():
        film_url = '%s%s' % (source.url, k)
        req = opener.open(film_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            extra = get_critic(data)
            f.extra = extra
            f.save()
            updated.append(k)
        time.sleep(1)
    u = 'http://www.rottentomatoes.com/api/private/v1.0/m/list/find?page=1&limit=50&type=opening&minTomato=0&maxTomato=100&minPopcorn=0&maxPopcorn=100&services=&genres=1%3B2%3B4%3B5%3B6%3B8%3B9%3B10%3B11%3B13%3B14%3B18&sortBy=popularity&certified=false'
    req = opener.open(u)
    if req.getcode() == 200:
        data = json.loads(req.read(), encoding="latin-1")
        for i in data['results']:
            title = i['title'].encode('utf-8')
            title_slug = low(del_separator(title))
            url = i['url'].lstrip('/')
            full_url = '%s%s' % (source.url, url)
            if url not in exists and url not in noffilms:
                if title_slug.decode('utf-8') not in ignored and url not in updated:
                    time.sleep(1)
                    req2 = opener.open(full_url)
                    if req2.getcode() == 200:
                        data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                        year_block = data2.find('h1', {'class': 'title hidden-xs'})
                        if not year_block:
                            year_block = data2.find('h1', id='movie-title')
                        year_tmp = year_block.find('span', {'class': 'h3 year'}).text.encode('utf-8')
                        year = int(year_tmp.replace('(', '').replace(')', ''))
                        release_date = data2.find('td', itemprop="datePublished")
                        if release_date:
                            release_date = release_date.get('content')
                        extra = get_critic(data2)
                        obj = films.get(url)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                                obj.extra = extra
                                obj.save()
                            else:
                                kid, info = film_identification(None, title_slug, {}, {},
                                                                year, source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    new = create_sfilm(url, kid, source, title,
                                                       txt=release_date, extra=extra)
                                    films[url] = new
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(new)
                            elif not obj:
                                data_nof_film += xml_noffilm(title, title_slug, None, None,
                                                             url.encode('utf-8'), info,
                                                             full_url.encode('utf-8'), source.id)
                                noffilms.append(url)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы, рейтинг')
def get_kinobusiness(request, country_data):
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.kinobusiness.com/')
    country = Country.objects.get(name=country_data['ru'])
    bx_ids = list(BoxOffice.objects.filter(country=country).values_list('bx_id', flat=True))
    films = BoxOffice.objects.filter(country=country).distinct('kid')
    films_dict = {}
    for i in films:
        films_dict[i.source_id] = i.kid
    data_nof_films = ''
    data_nof_distr = ''
    noffilms = []
    nofdistr = []
    if country_data['en'] == 'usa':
        main_url = '%skassa_world_prokata/kassa-usa/' % source.url
        add = ''
    else:
        main_url = '%skassovye_sbory/weekend/' % source.url
        add = 'usd/'
    req = urllib.urlopen(main_url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', {'class': 'table-responsive'})
        data = div.findAll('table', limit=1)[0]
        # data = data.find('table', {'class': "table table-striped table-hover calendar_year ned"})
        tr = data.findAll('tr', limit=2)[1]
        a = tr.findAll('a')[0].get('href').lstrip('/')
        req = urllib.urlopen('%s%s%s' % (source.url, a, add))
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            date = data.find('h1', {'class': 'film__title'})
            date = date.find('small').text.encode('utf-8')
            to_day, to_month, to_year = re.findall(
                r'\-\s[\d+\.?]+', date)[0].replace('- ', '').split('.')
            date_to = datetime.date(int(to_year), int(to_month), int(to_day))
            date_from = date_to - datetime.timedelta(days=3)
            counter = 0
            main = data.find('table', id="krestable")
            for index, tr in enumerate(main.findAll('tr')):
                if index != 0:
                    if country_data['en'] == 'usa':
                        trs = tr.findAll('td', limit=5)
                        film_name = trs[2].text.strip().encode('utf-8')
                        film_name_orig = trs[3].text.strip().encode('utf-8')
                        a = trs[2].find('a')
                    else:
                        trs = tr.findAll('td', limit=5)
                        film_name = trs[3].text.strip().encode('utf-8')
                        film_name_orig = trs[4].text.strip().encode('utf-8')
                        a = trs[3].find('a')
                    url = a.get('href').encode('utf-8') if a else None
                    film_name = film_name.replace('*', '')
                    film_slug = low(del_separator(film_name))
                    film_slug_orig = low(del_separator(film_name_orig))
                    full_url = ''
                    if url:
                        full_url = '%s%s' % (source.url, url.lstrip('/'))
                        full_url = full_url.encode('utf-8')
                    film_id = film_slug.decode('utf-8')
                    film_slug_orig = film_slug_orig.decode('utf-8')
                    bx_id = '%s%s%s%s%s' % (film_id, film_slug_orig, date_from,
                                            date_to, country_data['dump'])
                    if bx_id not in bx_ids:
                        distributors = []
                        week_audience = None
                        td = tr.findAll('td')
                        if country_data['en'] == 'usa':
                            distributors = td[4].text
                            week_sum = int(float(td[5].text.replace(u' ', '').replace(u',', u'.')))
                            screens = int(float(td[7].text.replace(u' ', '').replace(u',', u'.').replace(u'-', u'0')))
                            all_sum = int(float(td[9].text.replace(u' ', '').replace(u',', u'.')))
                            days = int(float(td[11].text.replace(u' ', '').replace(u',', u'.'))) * 7
                            all_audience = None
                        else:
                            distributors = td[5].text
                            week_sum = int(float(td[6].text.replace(u' ', '').replace(u',', u'.')))
                            screens = int(float(td[8].text.replace(u' ', '').replace(u',', u'.').replace(u'-', u'0')))
                            days = int(float(td[10].text.replace(u' ', '').replace(u',', u'.')))
                            all_sum = int(float(td[11].text.replace(u' ', '').replace(u',', u'.')))
                            all_audience = td[12].text.replace(u' ', '').replace(u',', u'.')
                            all_audience = int(float(all_audience)) if all_audience else None
                        if distributors:
                            distributors = distributors.encode('utf-8').replace('*', '').split('/')
                        else:
                            distributors = []
                        dlist = []
                        for dname in distributors:
                            dname = dname.strip().replace('&amp;', '&')
                            dname_slug = low(del_separator(dname))
                            if dname_slug not in nofdistr:
                                distr, status = distributor_identification(dname, dname_slug)
                                if distr:
                                    dlist.append(distr)
                                else:
                                    data_nof_distr += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (
                                        dname.replace('&', '&amp;'), dname_slug, None)
                                    nofdistr.append(dname_slug)
                        if dlist:
                            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                film_obj = films_dict.get(film_id)
                                if not film_obj:
                                    '''
                                    req2 = urllib.urlopen(full_url)
                                    if req2.getcode() == 200:
                                        counter += 1
                                        data2 = BeautifulSoup(req2.read())
                                        film_details = data2.find('table', {'class': 'news-detail'})
                                        year = None
                                        for p in film_details.findAll('p'):
                                            if p.b:
                                                year_tag = p.b.string.encode('utf-8').strip()
                                                if year_tag == 'Год:':
                                                    year = re.findall(r'\d+', p.text.encode('utf-8').strip())[0]
                                        if year:
                                    '''
                                    d1, d2 = (dlist[0].kid, dlist[1].kid) if len(dlist) > 1 else (dlist[0].kid, None)
                                    kid, info = film_identification(film_slug, None, d1, d2,
                                                                    source=source)
                                    if kid:
                                        film_obj = kid
                                        films_dict[film_id] = kid
                                    else:
                                        data_nof_films += xml_noffilm(film_name, film_slug, None, None,
                                                                      film_id.encode('utf-8'), info,
                                                                      full_url, source.id)
                                        noffilms.append(film_id)
                                if film_obj:
                                    boxoffice = BoxOffice.objects.create(
                                        bx_id=bx_id,
                                        source_id=film_id,
                                        source_obj=source,
                                        name=film_name,
                                        kid=film_obj,
                                        screens=screens,
                                        date_from=date_from,
                                        date_to=date_to,
                                        week_sum=week_sum,
                                        all_sum=all_sum,
                                        week_audience=week_audience,
                                        all_audience=all_audience,
                                        days=days,
                                        country=country,
                                    )
                                    for i in dlist:
                                        boxoffice.distributor.add(i)
                                    bx_ids.append(bx_id)
                    if counter % 3 == 0:
                        time.sleep(random.uniform(1.0, 3.0))
    create_dump_file('%s_nof_distributor' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_distr)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'boxoffice%s' % country_data['dump'],
                 'Кассовые сборы %s' % country_data['ru'])
    return HttpResponseRedirect(reverse("boxoffice_admin",
                                        kwargs={'country': country_data['en']}))
def tvzavr_ident():
    source = ImportSources.objects.get(url='http://www.tvzavr.ru/')
    sfilm_clean(source)
    path = '%s/dump_%s_index.xml' % (settings.API_DUMP_PATH, source.dump)
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    with open(path, 'r') as f:
        data = BeautifulSoup(f.read(), "html.parser")
    for i in data.findAll('url'):
        title = i.find('video:video').find('video:title').text.encode('utf-8')
        slug = low(del_separator(title))
        film_id = i.find('tvzavr:video').find('tvzavr:id').text
        if 'серия' not in slug and film_id not in noffilms:
            if slug.decode('utf-8') not in ignored:
                url = i.find('loc').text.encode('utf-8')
                year = i.find('tvzavr:video').find('tvzavr:year').text
                obj = films.get(film_id)
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(slug, None, {}, {},
                                                        year=year, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            new = create_sfilm(film_id, kid, source, title,
                                               year=year, extra=url)
                            films[film_id] = new
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                            fdict[kid]['script_rel'].append(new)
                    elif not obj:
                        data_nof_film += xml_noffilm(title, slug, None, None,
                                                     film_id.encode('utf-8'), info,
                                                     url, source.id)
                        noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
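
# The tvzavr index dump parsed above is assumed to be a video-sitemap-like
# file. A minimal sketch of one <url> entry matching the lookups used in
# tvzavr_ident() (loc, video:video/video:title, tvzavr:video with tvzavr:id
# and tvzavr:year); the values are illustrative only:
_TVZAVR_ENTRY_SKETCH = '''
<url>
  <loc>http://www.tvzavr.ru/Film-name</loc>
  <video:video>
    <video:title>Название фильма</video:title>
  </video:video>
  <tvzavr:video>
    <tvzavr:id>12345</tvzavr:id>
    <tvzavr:year>2013</tvzavr:year>
  </tvzavr:video>
</url>
'''
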
def get_rambler_films():
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    noffilms = []
    data_nof_films = ''
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_films.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
        if xml:
    # --- end localhost
    '''
    # SERVER
    f = open('%s/dump_rambler_index.xml' % settings.API_DUMP_PATH, 'r')
    xml_index = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    creations = xml_index.find('creations')
    filenames = []
    for i in creations.findAll('file'):
        filename = i.get('filename')
        if filename:
            filenames.append(filename)
    for i in filenames:
        url = 'http://api.kassa.rambler.ru/v2/%s/xml/Movie/export/sale/%s' % (
            RAMBLER_API_KEY, i)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            xml = BeautifulSoup(req.read(), from_encoding="utf-8")
            # --- end server
            for i in xml.findAll('creation'):
                film_id = i.objectid.string
                if film_id not in noffilms:
                    try:
                        year = int(i.year.string) if i.year.string else None
                    except UnicodeEncodeError:
                        year = None
                    full_url = 'https://kassa.rambler.ru/movie/%s' % film_id
                    name = i.find('name').string.encode('utf-8')
                    name_slug = low(del_separator(name))
                    if year and name_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                try:
                                    kid, info = film_identification(
                                        name_slug, None, {}, {}, year=year, source=source)
                                except db.backend.Database._mysql.OperationalError:
                                    next_step = False
                            if next_step:
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid, source, name, year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(new)
                                elif not obj:
                                    data_nof_films += xml_noffilm(
                                        name, name_slug, None, None,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films.replace('&', '&amp;'))
    cron_success('xml', source.dump, 'films', 'Фильмы')
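
# The dump written above has to be well-formed XML, so bare ampersands in
# film names are assumed to need escaping before create_dump_file() wraps the
# records in <data>...</data>. A minimal illustration of that escaping
# (Python 2 byte strings, as used throughout this module):
def _escape_amp_example():
    fragment = '<film name="Tom & Jerry"></film>'
    return fragment.replace('&', '&amp;')  # '<film name="Tom &amp; Jerry"></film>'
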
def get_kinoteatrua_films_and_persons():
    ''' Fetching films '''
    opener = give_me_cookie()
    source = ImportSources.objects.get(url='http://kino-teatr.ua/')
    sfilm_clean(source)
    try:
        with open('%s/dump_%s_nof_film.xml' % (settings.NOF_DUMP_PATH, source.dump), 'r') as f:
            xml_data = BeautifulSoup(f.read(), from_encoding="utf-8")
    except IOError:
        xml_data = BeautifulSoup('', from_encoding="utf-8")
    ignored = get_ignored_films()
    films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]
    data_nof_film = ''
    persons_dict = {}
    data_nof_persons = ''
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    year = datetime.datetime.now().year
    lang = Language.objects.get(name='Украинский')

    def get_persons(data):
        persons = {}
        tags = ['director', 'actor']
        for tag in tags:
            for p in data.findAll('span', itemprop=tag):
                person_id = p.a.get('href')
                person_id = long(re.findall(r'\d+', person_id)[0])
                if p.a.text:
                    persons[person_id] = p.a.text.encode('utf-8')
        return persons

    films_urls = get_kinoteatrua_films_links('http://kino-teatr.ua/films-near.phtml',
                                             1, year, source, opener)
    for ind, film in enumerate(films_urls):
        film_ua_url = film['url'].replace(source.url, '%suk/' % source.url)
        req_text = opener.open(urllib2.Request(film_ua_url))
        if req_text.getcode() == 200:
            film_data = BeautifulSoup(req_text.read(), from_encoding="utf-8")
            persons = get_persons(film_data)
            persons_dict[film['id']] = persons
            name = film_data.find('div', {'class': 'myriadFilm'}).text.encode('utf-8')
            name = name.replace('Фільм ', '').strip()
            text = film_data.find('div', itemprop='description')
            text_data = text.findAll('p', limit=1)
            if text_data:
                text = text_data[0].text.encode('utf-8')
            else:
                text = text.text.encode('utf-8').strip()
            text = text.replace('редактирование синопсиса', '').strip()
            if text in ('Проект оголошений', 'Підготовка до зйомок'):
                text = ''
            film_slug = low(del_separator(film['name']))
            temp_film_slug = film_slug.decode('utf-8')
            if temp_film_slug not in ignored and temp_film_slug not in films_slugs:
                obj = films.get(film['id'])
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(film_slug, None, {}, {},
                                                        year, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film['id'], kid, source, name,
                                                year=film.get('year'), txt=text)
                            films[film['id']] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        if temp_film_slug not in films_slugs:
                            data_nof_film += xml_noffilm(film['name'], film_slug, None, None,
                                                         film['id'], info,
                                                         film['url'].encode('utf-8'), source.id)
                    if objt and not create_new:
                        try:
                            film_text = objt.text.encode('utf-8')
                        except UnicodeDecodeError:
                            film_text = objt.text
                        if film_text != text:
                            objt.text = text
                            objt.save()
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace(
        '</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_film)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, xml_data)
    cron_success('html', source.dump, 'films', 'Укр. фильмы')

    # persons
    persons_nof_list = []
    persons_list = []
    for ind, film in enumerate(films_urls):
        req = opener.open(urllib2.Request(film['url']))
        if req.getcode() == 200:
            film_data = BeautifulSoup(req.read(), from_encoding="utf-8")
            persons = get_persons(film_data)
            for person_id, person_ru_name in persons.iteritems():
                if person_id not in persons_nof_list and person_id not in persons_list:
                    ukr_person = persons_dict.get(film['id'])
                    if ukr_person:
                        ukr_person_name = ukr_person.get(person_id)
                        if ukr_person_name:
                            ukr_person_name_slug = low(del_separator(ukr_person_name))
                            person_ru_name_slug = low(del_separator(person_ru_name))
                            person_obj = Person.objects.filter(
                                name__name=person_ru_name_slug).exclude(kid=None)
                            if person_obj.count() == 1:
                                names = [{'name': ukr_person_name, 'status': 1},
                                         {'name': ukr_person_name_slug, 'status': 2}]
                                for i in names:
                                    name_obj, name_created = NamePerson.objects.get_or_create(
                                        name=i['name'],
                                        status=i['status'],
                                        language=lang,
                                        defaults={
                                            'name': i['name'],
                                            'status': i['status'],
                                            'language': lang,
                                        })
                                    if name_obj not in person_obj[0].name.all():
                                        person_obj[0].name.add(name_obj)
                            else:
                                data_nof_persons += '<person name="%s" slug="%s" code="%s" name_alt="%s" slug_alt="%s"></person>' % (
                                    person_ru_name.replace('"', "'"), person_ru_name_slug,
                                    person_id, ukr_person_name.replace('"', "'"),
                                    ukr_person_name_slug)
                            persons_list.append(person_id)
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    create_dump_file('%s_nof_person' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_persons)
    cron_success('html', source.dump, 'persons', 'Укр. персоны')
def get_arsenalclub_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Нефтекамск'
    cinema_name = 'Арсенал'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://arsenal-club.com/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    today = datetime.datetime.now().date()
    future = today + datetime.timedelta(days=6)
    delta = future - today
    for d in range(delta.days + 1):
        date = today + datetime.timedelta(days=d)
        flag = False
        url = '%skino/?rasdel=kino&day=%s#daybox' % (source.url, date.strftime('%d.%m'))
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())  # , from_encoding="utf-8"
            for table in data.findAll('table', width="100%", cellpadding="3",
                                      cellspacing="1", bgcolor="#393939"):
                trs = table.findAll('tr', bgcolor="#292929")
                if len(trs) == 0:
                    flag = True
                else:
                    for tr in trs:
                        times, film, price = tr.findAll('td')
                        full_url = film.a.get('href').encode('utf-8') if film.a and film.a.get('href') else None
                        if full_url:
                            film_name = film.a.text.encode('utf-8').strip()
                        else:
                            film_name = film.text.encode('utf-8').strip()
                        film_slug = del_screen_type(low(del_separator(film_name)))
                        if full_url:
                            film_id = full_url.replace('http://www.kinopoisk.ru/film/', '').encode('utf-8')
                        else:
                            film_id = film_slug
                        if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {},
                                                                    source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                                 film_id, info, full_url, source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    hours, minutes = times.string.split(':')
                                    dtime = datetime.datetime(date.year, date.month, date.day,
                                                              int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        if flag:
            break
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_okinoua_releases():
    ''' Parser for Ukrainian releases '''
    # NOTE: `source` (the okino.ua ImportSources record) is used below but
    # never bound in this function; it is assumed to come from an enclosing
    # or module scope.
    f = open('%s/dump_okino.ua.links.xml' % settings.API_DUMP_PATH, 'r')
    links = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    xml = open('%s/dump_okinoua_nof_film.xml' % settings.NOF_DUMP_PATH, 'r')
    xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
    xml.close()
    films_slugs = []
    for i in xml_data.findAll('film'):
        slug = i.get('slug_ru')
        films_slugs.append(slug)
    data_nof_film = ''
    for index, i in enumerate(links.findAll('release')):
        url = i.link['value']
        distr = i.distr['value']
        film_id = i.id['value']
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            html_data = BeautifulSoup(req.read(), from_encoding="utf-8")
            title = html_data.find('div', {'class': 'item'})
            name_ru = title.h1.text.encode('utf-8')
            name_ua = None
            if title.h4:
                name_ua = title.h4.text.encode('utf-8')
                if name_ua == '(Не)очікуваний принц (Un prince (presque) charmant)':
                    name_ua = '(Не)очікуваний принц'
                else:
                    name_ua = re.sub(r'\(.*?\)', '', name_ua).strip()
                name_ua = name_ua if re.findall(
                    ur'[а-яА-Я]', name_ua.decode('utf-8')) else None
            name_slug = del_screen_type(name_ru)
            name_slug = low(del_separator(name_slug))
            details = html_data.find('div', {'class': 'params'})
            release_date = None
            year_m = None
            for i in details.ul.findAll('li'):
                if i.span.text == u'Год:':
                    year_main = i.text.split(':')
                    year_m = year_main[1].strip()
                elif i.span.text == u'Премьера в Украине:':
                    release_txt = i.text.split(':')
                    day, month, year = release_txt[1].strip().split(' ')
                    month = int(get_month(month.encode('utf-8')))
                    release_date = datetime.date(int(year), month, int(day))
            kid, info = film_identification(name_slug, None, {}, {},
                                            year=year_m, source=source)
            if kid:
                if release_date:
                    obj, created = Okinoua.objects.get_or_create(
                        url=url,
                        defaults={
                            'url': url,
                            'distributor': distr,
                            'release': release_date,
                            'kid': kid,
                            'name_ru': name_ru,
                            'name_ua': name_ua,
                        })
                    if not created:
                        if obj.distributor != distr:
                            obj.distributor = distr
                        if obj.release != release_date:
                            obj.release = release_date
                        name_ua = name_ua.decode('utf-8') if name_ua else None
                        if obj.name_ua != name_ua:
                            obj.name_ua = name_ua
                        obj.save()
            else:
                slug_tag = 'slug_ru="%s"' % name_slug
                if slug_tag not in data_nof_film and name_slug.decode('utf-8') not in films_slugs:
                    data_nof_film += xml_noffilm(name_ru, name_slug, name_ua, None,
                                                 film_id.encode('utf-8'), info,
                                                 url.encode('utf-8'), source.id)
        # pause for 1-3 seconds on every 4th request to the source
        if index % 3 == 0:
            time.sleep(random.uniform(1.0, 3.0))
def get_cinema5_schedules(): data_nof_cinema = '' data_nof_film = '' noffilms = [] ignored = get_ignored_films() source = ImportSources.objects.get(url='http://cinema5.ru/') sfilm_clean(source) films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) schedules = get_source_data(source, 'schedule', 'list') data = [ { 'city': 'Нижнекамск', 'url': '%snk' % source.url }, { 'city': 'Оренбург', 'url': '%soren' % source.url }, { 'city': 'Саратов', 'url': '%ssaratov' % source.url }, { 'city': 'Уфа', 'url': '%sufa' % source.url }, { 'city': 'Чебоксары', 'url': '%scheby' % source.url }, ] params = ['today', 'tomorrow', '+2days'] cinema_name = 'Синема 5' cinema_slug = low(del_separator(cinema_name)) for i in data: city_slug = low(del_separator(i['city'])) city = City.objects.get(name__name=i['city'], name__status=1) city_obj, city_created = SourceCities.objects.get_or_create( source_id=city_slug, source_obj=source, defaults={ 'source_id': city_slug, 'source_obj': source, 'city': city, 'name': i['city'], }) cinema = None try: cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city) except Cinema.DoesNotExist: data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % ( cinema_name, cinema_slug, i['city'], city_obj.city.kid) if cinema: cinema_id = '%s_%s' % (cinema_slug, city_slug) cinema_obj, cinema_created = SourceCinemas.objects.get_or_create( source_id=cinema_id, source_obj=source, defaults={ 'source_id': cinema_id, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': cinema_name, }) for param in params: url = '%s?date=%s' % (i['url'], param) req = urllib.urlopen(url) if req.getcode() == 200: page_data = BeautifulSoup(req.read()) divs = page_data.find('div', {'class': 'content clearfix'}) showdate = divs.find('h1') if showdate: showdate = showdate.string.encode('utf-8') day, month, year = showdate.replace( 'Расписание на ', '').strip().split('.') for div in divs.findAll('div', {'class': 'show-wrapper'}): film_name = div.find('div', { 'class': 'title' }).string.encode('utf-8') film_slug = low( del_separator(del_screen_type(film_name))) film_id = film_slug if film_id not in noffilms and film_slug.decode( 'utf-8') not in ignored: obj = films.get(film_id.decode('utf-8')) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification( film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func( fdict, kid, obj) if create_new: objt = create_sfilm( film_id, kid, source, film_name) films[film_id.decode( 'utf-8')] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append( objt) elif not obj: data_nof_film += xml_noffilm( film_name, film_slug, None, None, film_id, info, None, source.id) noffilms.append(film_id) if objt: for span in div.findAll( 'span', {'class': 'time'}): hours, minutes = span.string.strip( ).split(':') dtime = datetime.datetime( int(year), int(month), int(day), int(hours), int(minutes)) sch_id = '%s%s%s%s' % ( dtime, cinema_id, city_slug, film_id) sch_id = sch_id.replace( ' ', '').decode('utf-8') if sch_id not in schedules: SourceSchedules.objects.create( source_id=sch_id, source_obj=source, film=objt, cinema=cinema_obj, dtime=dtime, ) schedules.append(sch_id) create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema) 
create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('html', source.dump, 'schedules', 'Сеансы')
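# Every schedule parser in this file builds its dedup key the same way:
# concatenate the datetime with cinema/city/film ids and strip spaces. A
# sketch of that pattern as one helper (make_schedule_key is a hypothetical
# name); plain concatenation can collide if ids overlap, so parts are kept
# in the order the parsers already use, and callers decode('utf-8') the
# result before comparing it with stored ids, as above.
def make_schedule_key(dtime, *parts):
    key = '%s' % dtime
    for part in parts:
        key += '%s' % part
    return key.replace(' ', '')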
def get_okinoua_distributors(request): form = OkinoUploadForm() if request.POST: form = OkinoUploadForm(request.POST, request.FILES) if form.is_valid(): source = ImportSources.objects.get(url='http://www.okino.ua/') with open( '%s/dump_%s_nof_film.xml' % (settings.NOF_DUMP_PATH, source.dump), 'r') as f: xml_data = BeautifulSoup(f.read(), from_encoding="utf-8") ignored = get_ignored_films() films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')] today = datetime.date.today() films_dict = get_source_data(source, 'film', 'dict') releases = SourceReleases.objects.select_related('film').filter( film__source_obj=source, release__gte=today) releases_dict = {} for i in releases: releases_dict[i.film.source_id] = i data_nof_films = '' data = request.FILES['file'].read() html_data = BeautifulSoup(data, from_encoding="utf-8") main = html_data.find('div', {'class': 'release_list'}) year = datetime.date.today().year first_h3 = main.findAll('h3', limit=1)[0] for div in first_h3.find_next_siblings(): film_tag = div.find('p', {'class': 'name'}) flag = False if film_tag: flag = True film_tag = film_tag.a film_name = film_tag.string.encode('utf-8') full_url = film_tag.get('href').encode('utf-8') film_id = re.findall(r'\d+\/$', full_url)[0].replace( '/', '').encode('utf-8') film_slug = low(del_separator(film_name)) film_year = div.find('span', { 'class': 'y' }).string.encode('utf-8').replace('(', '').replace(')', '') full_url = 'http://www.okino.ua%s' % full_url release_day = int( div.find('span', { 'class': 'day' }).string) release_month = div.find('span', { 'class': 'month' }).string.encode('utf-8') release_month = get_month(release_month) release_date = datetime.date(year, int(release_month), release_day) film_obj = films_dict.get(film_id) if not film_obj: kid, info = film_identification(film_slug, None, {}, {}, year=film_year, source=source) if kid: film_obj = SourceFilms.objects.create( source_id=film_id, source_obj=source, name=film_name, kid=kid, year=film_year, ) else: temp_film_slug = film_slug.decode('utf-8') if temp_film_slug not in films_slugs and temp_film_slug not in ignored: films_slugs.append(film_slug.decode('utf-8')) data_nof_films += xml_noffilm( film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id) if film_obj: for p in div.findAll('p'): if p.string: text = p.string.encode('utf-8') if 'Дистрибьютор:' in text: distr = text.replace( 'Дистрибьютор: ', '').decode('utf-8') release_obj = releases_dict.get( film_id) if release_obj: if release_obj.release != release_date or release_obj.distributor != distr: release_obj.release = release_date release_obj.distributor = distr release_obj.save() else: release_obj = SourceReleases.objects.create( source_obj=source, film=film_obj, release=release_date, distributor=distr, ) releases_dict[ film_id] = release_obj if div.string: year = int( re.findall(r'\d+$', div.string.encode('utf-8'))[0]) xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace( '</data></body></html>', '') xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, xml_data) return HttpResponseRedirect(reverse('admin_source_releases_show')) return render_to_response('release_parser/okinoua_upload.html', {'form': form}, context_instance=RequestContext(request))
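# Re-serializing the parsed not-found dump above relies on string-replacing
# the <html><head></head><body> wrapper that BeautifulSoup adds around XML
# parsed as HTML. A sketch of a less brittle extraction, assuming the same
# BeautifulSoup import this file already uses:
def inner_xml(xml_text):
    soup = BeautifulSoup(xml_text, from_encoding='utf-8')
    data = soup.find('data')
    # join the markup of every child of <data>, skipping the wrapper
    return ''.join(unicode(c) for c in data.contents) if data else ''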
def get_kinomagnat_schedules(): ignored = get_ignored_films() data_nof_film = '' data_nof_hall = '' data_nof_cinema = '' noffilms = [] nofhalls = [] city_name = 'Киев' cinema_name = 'Магнат' city_slug = low(del_separator(city_name)) cinema_slug = low(del_separator(cinema_name)) source = ImportSources.objects.get(url='http://www.kinomagnat.com.ua/') sfilm_clean(source) films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) schedules = get_source_data(source, 'schedule', 'list') halls = get_source_data(source, 'hall', 'dict') city = City.objects.get(name__name=city_name, name__status=1) try: cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city) except Cinema.DoesNotExist: cinema = None data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city.kid) if cinema: city_obj, city_created = SourceCities.objects.get_or_create( source_id=city_slug, source_obj=source, defaults={ 'source_id': city_slug, 'source_obj': source, 'city': city, 'name': city_name, }) cinema_obj, cinema_created = SourceCinemas.objects.get_or_create( source_id=cinema_slug, source_obj=source, defaults={ 'source_id': cinema_slug, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': cinema_name, }) cinema_kid = cinema.code city_kid = city.kid today = datetime.date.today() url = '%sseans.html?device=iphone' % source.url req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read()) div = data.find('div', {'class': 'contentpaneopen'}) for table in div.findAll('table'): try: day, month = table.find_all_previous("p", limit=2)[1].text.strip().split() except ValueError: try: day, month = table.find_all_previous("p", limit=3)[2].text.strip().split() except ValueError: day, month = table.find_all_previous("p", limit=4)[3].text.strip().split() month = get_month_ua(low(month.encode('utf-8'))) date_sch = datetime.date(today.year, month, int(day)) hall_name = table.findAll('tr', limit=1)[0].text.strip().encode('utf-8') hall_name_slug = low(del_separator(hall_name)) hall_obj = None if hall_name_slug not in nofhalls: hall_obj = halls.get(hall_name_slug) if not hall_obj: halls_obj = Hall.objects.filter(name__name=hall_name_slug, cinema=cinema_obj.cinema).distinct('pk') if halls_obj.count() == 1: hall_kid = halls_obj[0].kid hall_obj = SourceHalls.objects.create( source_id=hall_name_slug, source_obj=source, cinema=cinema_obj, name=hall_name, kid=hall_kid, ) halls[hall_name_slug] = hall_obj else: id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name, hall_name_slug) id = id.replace(' ', '') data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (city_name, city_kid, cinema_name, cinema_kid, hall_name, hall_name_slug, id) nofhalls.append(hall_name_slug) if hall_obj: for ind, tr in enumerate(table.findAll('tr')): if ind != 0: showtime, film_data = tr.findAll('td', limit=2) hour, minute = showtime.text.strip().encode('utf-8').split(':') dtime = datetime.datetime(date_sch.year, date_sch.month, date_sch.day, int(hour), int(minute)) a = film_data.find('a') film_id = a.get('href').encode('utf-8') full_url = '%s%s' % (source.url, film_id.lstrip('/')) film_name = a.text.strip().encode('utf-8') film_slug = low(del_separator(film_name)) if film_id not in noffilms and film_slug.decode('utf-8') not in ignored: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: if
obj: kid = obj.kid else: kid, info = film_identification(film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func(fdict, kid, obj) if create_new: new = create_sfilm(film_id, kid, source, film_name) films[film_id] = new if not fdict.get(kid): fdict[kid] = {'editor_rel': [], 'script_rel': []} fdict[kid]['script_rel'].append(new) elif not obj: data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id) noffilms.append(film_id) if objt: sch_id = '%s%s%s' % (dtime, hall_obj.id, film_id) sch_id = sch_id.replace(' ', '').decode('utf-8') if sch_id not in schedules: SourceSchedules.objects.create( source_id=sch_id, source_obj=source, film=objt, cinema=cinema_obj, hall=hall_obj.kid, dtime=dtime, ) schedules.append(sch_id) create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema) create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_hall) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('html', source.dump, 'schedules', 'Сеансы')
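# The schedule rows above split 'HH:MM' strings by hand before building a
# datetime. The same step with strptime, which also rejects malformed
# values, as a small sketch (combine_showtime is a hypothetical name):
import datetime

def combine_showtime(date_sch, text):
    t = datetime.datetime.strptime(text.strip(), '%H:%M').time()
    return datetime.datetime.combine(date_sch, t)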
def get_okinoua_films(): """ Parse films for Ukraine """ xml = open('%s/dump_okinoua_nof_film.xml' % settings.NOF_DUMP_PATH, 'r') xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8") xml.close() films_slugs = [] for i in xml_data.findAll('film'): slug = i.get('slug_ru') films_slugs.append(slug) source = ImportSources.objects.get(url='http://www.okino.ua/') data_nof_films = '' not_founded_films = [] # get the list of already identified OkinoUA films okinoua_films = get_source_data(source, 'film', 'list') # get the dict of identified OkinoUA cities okinoua_cities_dict = get_source_data(source, 'city', 'dict') # get the dict of identified OkinoUA cinemas okinoua_cinemas_dict = get_source_data(source, 'cinema', 'dict') counter = 0 for city_id, city_obj in okinoua_cities_dict.iteritems(): counter += 1 url = '%s%s/' % (source.url, city_id) req = urllib.urlopen(url) dates = [] if req.getcode() == 200: page = BeautifulSoup(req.read(), from_encoding="utf-8") for div in page.findAll('div', {'class': 'item0'}): for film in div.findAll('div', {'class': 'item2'}): alt_name = None if film.div.div.a: film_name = film.div.div.a.string.encode('utf-8') film_a = film.div.div.a.get('href') film_id = film_a.replace('/film/', '').replace('/', '').encode('utf-8') full_url = '%sfilm/%s' % (source.url, film_id) req_name = urllib.urlopen(full_url) if req_name.getcode() == 200: filmpage = BeautifulSoup(req_name.read(), from_encoding="utf-8") title = filmpage.find('div', {'class': 'item'}) if title.h4: alt_name = title.h4.text.encode('utf-8') alt_name = re.sub(r'\(.*?\)', '', alt_name).strip() else: film_name = film.div.div.string.strip().encode('utf-8') film_id = None full_url = None film_name_slug = low(del_separator(del_screen_type(film_name))) if not film_id: film_id = film_name_slug.decode('utf-8') if film_id not in okinoua_films: kid, info = film_identification(film_name_slug, None, {}, {}, source=source) if kid: film_obj, created = SourceFilms.objects.get_or_create( source_id=film_id, source_obj=source, defaults={ 'source_id': film_id, 'source_obj': source, 'name': film_name, 'kid': kid, 'name_alter': alt_name, }) else: slug_tag = 'slug_ru="%s"' % film_name_slug if slug_tag not in data_nof_films and film_name_slug.decode('utf-8') not in films_slugs: data_nof_films += xml_noffilm( film_name, film_name_slug, None, None, film_id.encode('utf-8'), info, full_url.encode('utf-8') if full_url else None, source.id) okinoua_films.append(film_id) if counter % 4 == 0: time.sleep(random.uniform(1.0, 3.0)) xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '') xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films) create_dump_file('okinoua_nof_film', settings.NOF_DUMP_PATH, xml_data) cron_success('html', 'okinoua', 'films', 'Фильмы')
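# get_month()/get_month_ua() used throughout are project helpers that map
# Russian and Ukrainian month names to numbers. A minimal illustration of
# the idea; the dict below is illustrative, not the real helper, which
# lives elsewhere:
MONTHS_RU = {
    'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4,
    'мая': 5, 'июня': 6, 'июля': 7, 'августа': 8,
    'сентября': 9, 'октября': 10, 'ноября': 11, 'декабря': 12,
}

def month_number(name):
    return MONTHS_RU[name.strip().lower()]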
def get_megamag(): ''' Получение urls фильмов ''' import cookielib def give_me_cookie(): cookie = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie), urllib2.HTTPHandler()) return opener ignored = get_ignored_films() ignored_cinemas = get_ignored_cinemas() source = ImportSources.objects.get(url='http://megamag.by/') sfilm_clean(source) megamag_cities_dict = get_source_data(source, 'city', 'dict') megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict') films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) cities_data = {} data_nof_films = '' data_nof_cinema = '' data_nof_city = '' noffilms = [] schedules_data = [] opener = give_me_cookie() req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php')) event_dict = {} if req.getcode() == 200: data = BeautifulSoup(req.read(), from_encoding="utf-8") cities = data.find('div', id="box-region") for i in cities.findAll('a'): city_name = i.text.encode('utf-8') city_slug = low(del_separator(city_name)) city_id = i.get('href').replace( 'http://kinoteatr.megamag.by/index.php?region_id=', '') mcity = megamag_cities_dict.get(city_id) if not mcity: city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk') if city.count() == 1: mcity = SourceCities.objects.create( source_id=city_id, source_obj=source, city=city[0], name=city_name, ) else: if 'slug="%s"' % city_slug not in data_nof_city: data_nof_city += '<city name="%s" slug="%s"></city>' % ( city_name, city_slug) if mcity: cities_data[city_name] = mcity try: cinemas_tag = data.findAll('td', {'class': 'Cinema_new_box_1_BoxText'}, limit=1)[0] except IndexError: cinemas_tag = None if cinemas_tag: for i in cinemas_tag.findAll('a'): cinema_url = i.get('href') cinema_id = cinema_url.replace( 'http://kinoteatr.megamag.by/index.php?cPath=', '') cinema_obj = megamag_cinemas_dict.get(cinema_id) opener = give_me_cookie() try: req2 = opener.open(urllib2.Request(cinema_url)) if req2.getcode() == 200: schedules_page = BeautifulSoup(req2.read(), from_encoding="utf-8") city_name = schedules_page.findAll( 'div', {'class': 'object_param_value'}, limit=1)[0].text.encode('utf-8') city_obj = cities_data.get(city_name) if city_obj: cinema_name = schedules_page.find( 'div', { 'class': 'object_title' }).text.encode('utf-8') cinema_name = cinema_name.replace('"', '').replace( 'Кинотеатр', '') cinema_slug = low(del_separator(cinema_name)) cinema_ig_id = u'%s__%s' % ( cinema_slug.decode('utf-8'), city_obj.city.kid) if cinema_ig_id not in ignored_cinemas: if not cinema_obj: filter1 = { 'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city } cinema_kid = cinema_identification( cinema_slug, filter1) if cinema_kid: try: cinema = Cinema.objects.get( code=cinema_kid) cinema_obj = SourceCinemas.objects.create( source_id=cinema_id, source_obj=source, city=city_obj, cinema=cinema, name=cinema_name, ) except Cinema.DoesNotExist: pass else: cinema_kid = cinema_obj.cinema.code if cinema_kid: for event in schedules_page.findAll( 'td', {'class': 'eventsHeading'}): if event.a.get('name'): ev = event.a['name'].split('_')[1] fname = event.a.text.encode( 'utf-8') fid = event.a.get('href').replace( 'http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=', '') event_dict[int(ev)] = { 'name': fname, 'id': int(fid) } links = [] for td in schedules_page.findAll( 'td', {'class': 'main'}): for link in td.findAll('a'): l = link.get('href') if l 
and 'cPath' in l: links.append(l) schedules_data.append({ 'mcity': city_obj, 'city': city_obj.city, 'mcinema': cinema_obj, 'cinema': cinema_kid, 'schedules': set(links) }) else: if 'slug="%s"' % cinema_slug not in data_nof_cinema: data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % ( cinema_name, cinema_slug, city_name, city_obj.city.kid) except httplib.HTTPException: pass create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city) create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema) megamag = get_source_data(source, 'schedule', 'list') for obj in schedules_data: cinema_object = obj['mcinema'] for index, i in enumerate(obj['schedules']): opener = give_me_cookie() try: req3 = opener.open(urllib2.Request(i)) if req3.getcode() == 200: id_schedule = i.replace('http://kinoteatr.megamag.by/index.php?cPath=', '').encode('utf-8') if id_schedule not in megamag: sch_page = BeautifulSoup(req3.read(), from_encoding="utf-8") tables = sch_page.findAll('table', { 'class': 'Cinema_new_box_2_TemplateCenterPart' }, limit=1)[0] main_table = tables.findAll('table', cellpadding='4', limit=1)[0] tr = main_table.findAll('tr')[1] td = tr.findAll('strong') event_id = id_schedule.split('_')[2] film_data = event_dict.get(int(event_id)) if film_data: film_name = film_data['name'] film_name_slug = low(del_separator(del_screen_type(film_name))) film_id = film_data['id'] if film_id not in noffilms and film_name_slug.decode('utf-8') not in ignored: fobj = films.get(str(film_id).decode('utf-8')) next_step = checking_obj(fobj) if next_step: if fobj: kid = fobj.kid else: kid, info = film_identification(film_name_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func(fdict, kid, fobj) if create_new: objt = create_sfilm(film_id, kid, source, film_name) films[str(film_id).decode('utf-8')] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append(objt) elif not fobj: data_nof_films += xml_noffilm(film_name, film_name_slug, None, None, film_id, info, None, source.id) noffilms.append(film_id) if objt: dtime_info = td[1].text.encode('utf-8').split() year_info = datetime.datetime.now().year day_info = int(dtime_info[0]) month_low = low(dtime_info[1].replace(',', '')) month_info = int(get_month(month_low)) time_info = dtime_info[-1].replace('(', '').replace(')', '').split(':') dtime = datetime.datetime(year_info, month_info, day_info, int(time_info[0]), int(time_info[1]), 0) SourceSchedules.objects.create( source_id=id_schedule, source_obj=source, cinema=cinema_object, film=objt, dtime=dtime, ) except httplib.HTTPException: open('%s/httplib_errors.txt' % settings.API_DUMP_PATH, 'a').write('%s\n' % i) # pause for 2 seconds after every 60th request to the source if (index + 1) % 60 == 0: time.sleep(2.0) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films) cron_success('html', source.dump, 'schedules', 'Сеансы')
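# give_me_cookie() above builds a fresh cookie-carrying opener before every
# request; one opener can be built once and reused, since its CookieJar
# persists across open() calls, and HTTPHandler is installed by default so
# it need not be passed explicitly. A sketch with the same urllib2/cookielib
# modules the function already imports:
import cookielib
import urllib2

def make_cookie_opener():
    jar = cookielib.CookieJar()
    return urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))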
def get_premierzal_schedules(): data_nof_film = '' noffilms = [] ignored = get_ignored_films() source = ImportSources.objects.get(url='http://www.premierzal.ru/') sfilm_clean(source) films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) schedules = get_source_data(source, 'schedule', 'list') cities_cinemas = {} for i in SourceCinemas.objects.select_related('city').filter( source_obj=source): if not cities_cinemas.get(i.city.source_id): cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []} cities_cinemas[i.city.source_id]['cinemas'].append(i) for k, v in cities_cinemas.iteritems(): city_url_encode = urllib.quote(v['city'].name.encode('utf-8')) for i in v['cinemas']: main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id, city_url_encode) main_req = urllib.urlopen(main_url) if main_req.getcode() == 200: data = BeautifulSoup(main_req.read()) data = data.find('div', id="films-list") if data: dates = [] for calendar in data.findAll('table', {'class': 'calendar'}): for a in calendar.findAll('a'): href = a.get('href', '') href_dict = dict(cgi.parse_qsl(href)) calendar_date = href_dict.get( u'?date', href_dict.get(u'date')) if calendar_date: dates.append({ 'date': calendar_date, 'href': href }) for ind, d in enumerate(dates): films_blocks = [] if ind == 0: films_blocks = data.findAll( 'div', {'class': 'film-item-wrapper'}) else: url = '%s?date=%s&city=%s&theatre=%s' % ( source.url, d['date'], city_url_encode, i.source_id) req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read()) data = data.find('div', id="films-list") films_blocks = data.findAll( 'div', {'class': 'film-item-wrapper'}) time.sleep(random.uniform(0.8, 2.2)) for block in films_blocks: title = block.find('div', { 'class': 'title' }).find('a') film_name = title.text.encode('utf-8').strip() film_slug = low( del_separator(del_screen_type(film_name))) film_id = film_slug if film_id not in noffilms and film_slug.decode( 'utf-8') not in ignored: obj = films.get(film_id.decode('utf-8')) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification( film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func( fdict, kid, obj) if create_new: objt = create_sfilm( film_id, kid, source, film_name) films[film_id.decode( 'utf-8')] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append( objt) elif not obj: data_nof_film += xml_noffilm( film_name, film_slug, None, None, film_id, info, None, source.id) noffilms.append(film_id) if objt: year, month, day = d['date'].split( u'-') for tm in block.findAll( 'div', {'class': 'seanse-item'}): for t in tm.text.encode( 'utf-8').split('|'): t = re.findall( r'\d{2}\:\d{2}', t) if t: hours, minutes = t[ 0].strip().split(':') dtime = datetime.datetime( int(year), int(month), int(day), int(hours), int(minutes)) sch_id = '%s%s%s' % ( dtime, i.source_id.encode( 'utf-8'), film_id) sch_id = sch_id.replace( ' ', '').decode('utf-8') if sch_id not in schedules: SourceSchedules.objects.create( source_id=sch_id, source_obj=source, film=objt, cinema=i, dtime=dtime, ) schedules.append( sch_id) time.sleep(random.uniform(1.1, 1.8)) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('html', source.dump, 'schedules', 'Сеансы')
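# The calendar links above are decoded with cgi.parse_qsl plus a special
# case for the leading '?date' key. A sketch using urlparse, which strips
# the '?' before parsing the query string, so no special case is needed:
import urlparse

def calendar_date(href):
    query = urlparse.urlparse(href).query
    return dict(urlparse.parse_qsl(query)).get('date')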
def get_kinohod_films(): # print "BEGIN get_kinohod_films()" ignored = get_ignored_films() t1 = time.time() start_time = datetime.datetime.now().strftime('%H:%M:%S') cron_data_new = '' cron_data_nof = '' cron_count = 0 noffilms = [] source = ImportSources.objects.get(url='http://kinohod.ru/') sfilm_clean(source) kinohod_cities = get_source_data(source, 'city', 'list') films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) data_nof_films = '' main_url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s' % SERVER_API_KEY for city_id in kinohod_cities: try: url = '%s&city=%s' % (main_url, city_id) req = urllib.urlopen(url) if req.getcode() == 200: json_data = req.read() data = json.loads(json_data) for i in data: cron_count += 1 film_id = str(i['id']).decode('utf-8') year = int( i['productionYear']) if i['productionYear'] else None name_ru = i['title'].encode('utf-8') name_ru_slug = low(del_separator(del_screen_type(name_ru))) full_url = '%smovie/%s/' % (source.url, film_id) name_en = None name_en_slug = None if i['originalTitle']: name_en = i['originalTitle'].encode('utf-8') name_en_slug = low( del_separator(del_screen_type(name_en))) if year and name_ru_slug.decode( 'utf-8' ) not in ignored and film_id not in noffilms: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: try: if obj: kid = obj.kid else: kid, info = film_identification( name_ru_slug, name_en_slug, {}, {}, year=year, source=source) objt = None if kid: create_new, objt = unique_func( fdict, kid, obj) if create_new: new = create_sfilm(film_id, kid, source, name_ru, name_alt=name_en, year=year) films[film_id] = new if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append(new) cron_data_new += '%s<br />' % name_ru elif not obj: if not name_en: name_en = '*' name_en_slug = '*' data_nof_films += xml_noffilm( name_ru, name_ru_slug, name_en, name_en_slug, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id) noffilms.append(film_id) cron_data_nof += '%s<br />' % name_ru except db.backend.Database._mysql.OperationalError: pass except IOError: open('%s/ddd.txt' % settings.API_DUMP_PATH, 'a').write(str(url) + '\n') create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films) # cron log end_time = datetime.datetime.now().strftime('%H:%M:%S') cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(), start_time, end_time, 'Импорт фильмов киноход') cron_data += '<br /><b>Обработано</b>: %s' % cron_count cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof for i in range(50): cron_data += '- ' process_time = time.time() - t1 cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data) open('%s/cron_log_kinohod_films.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data) cron_success('json', source.dump, 'films', 'Фильмы')
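# get_kinohod_films() measures its own runtime with time.time() and writes
# the result into the cron log. A sketch of that bookkeeping as a decorator
# (timed is a hypothetical helper, not part of this codebase):
import functools
import time

def timed(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.time()
        try:
            return fn(*args, **kwargs)
        finally:
            print '%s: %.2f sec' % (fn.__name__, time.time() - start)
    return wrapper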
def get_cinemaplex_releases(): ignored = get_ignored_films() distr_nof_data = '' data_nof_film = '' noffilms = [] nof_distributors = [] distributors = {} source = ImportSources.objects.get(url='http://cinemaplex.ru/') sfilm_clean(source) films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) today = datetime.datetime.today() url = '%s2013/01/30/release-schedule.html' % source.url ''' with open('cinemaplex.htm','r') as f: main = BeautifulSoup(f.read(), from_encoding="utf-8") if main: ''' req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read(), from_encoding="utf-8") main = data.find('div', {'class': 'post-entry'}) main = main.find('tbody') release_date = None for tr in main.findAll('tr'): all_td = tr.findAll('td') if len(all_td) == 1: if all_td[0].text.strip(): try: release_first, release_last = all_td[0].text.encode('utf-8').split('—') except ValueError: try: release_first, release_last = all_td[0].text.encode('utf-8').split('–') except ValueError: release_first, release_last = all_td[0].text.encode('utf-8').split('-') release_first = release_first.replace('\xc2\xa0', '').strip() try: release_first = int(release_first) except ValueError: release_last = release_first release_first = release_first.split()[0].strip() release_month = release_last.strip().split()[1] release_day = int(release_first) release_month = int(get_month(release_month)) past_month_range = [] for m in [1, 2, 3, 4]: past_dates = today - relativedelta(months=+m) past_month_range.append(past_dates.month) if release_month in past_month_range or ( release_month == today.month and release_day <= today.day): release_date = None else: release_year = today.year if release_month >= today.month else today.year + 1 release_date = datetime.date(release_year, release_month, release_day) elif release_date: film_name = all_td[0].text.encode('utf-8').strip() distributor = all_td[1].text.encode('utf-8').replace('&amp;', '&').split(',')[0].strip() #copies = all_td[2].text.encode('utf-8').strip() runtime = all_td[3].text.encode('utf-8').strip() #genres = all_td[5].text.encode('utf-8').strip() #limits = all_td[7].text.encode('utf-8').strip() try: details = all_td[8].text.encode('utf-8').strip() except IndexError: details = '' f_name = film_name.split('/') if len(f_name) == 2: f_name_ru, f_name_en = (f_name[0].strip(), f_name[1].strip()) else: f_name_ru, f_name_en = (f_name[0].strip(), f_name[0].strip()) film_slug_ru = low(del_separator(f_name_ru)) film_slug_en = low(del_separator(f_name_en)) film_slug = low(del_separator(film_name)) film_id = film_slug full_url = None ''' current_release_date = re.findall(r'с\s\d+\.\d+', details) if current_release_date: current_release_day = current_release_date[0].replace('с ','').split('.')[0] current_release_date = datetime.date(int(release_date.year), int(release_date.month), int(current_release_day)) else: current_release_date = release_date ''' if film_slug_ru: if film_id not in noffilms and film_slug_ru.decode('utf-8') not in ignored: # distributor distributor_slug = low(del_separator(distributor)) distributor_kid = distributors.get(distributor_slug) if not distributor_kid and distributor_slug.decode('utf-8') not in nof_distributors: distr, status = distributor_identification( distributor, distributor_slug) if distr: distributor_kid = distr.kid if distr.kid else None distributors[distributor_slug] = distributor_kid else: distr_nof_data += '<distributor 
value="%s" slug="%s" alt="%s"></distributor>' % ( distributor, distributor_slug, '') nof_distributors.append( distributor_slug.decode('utf-8')) if distributor_kid: obj = films.get(film_id.decode('utf-8')) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification( film_slug_ru, film_slug_en, distributor_kid, {}, source=source) objt = None if kid: create_new, objt = unique_func( fdict, kid, obj) if create_new: objt = create_sfilm( film_id, kid, source, f_name_ru) films[film_id.decode('utf-8')] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append(objt) elif not obj: data_nof_film += xml_noffilm( f_name_ru, film_slug_ru, f_name_en, film_slug_en, film_id, info, full_url, source.id) noffilms.append(film_id) if objt: sr_obj, sr_created = SourceReleases.objects.get_or_create( film=objt, source_obj=source, defaults={ 'film': objt, 'distributor': distributor, 'source_obj': source, 'release': release_date, }) if not sr_created: if sr_obj.release != release_date: sr_obj.release = release_date sr_obj.save() runtime = runtime.replace('-', '').strip() if runtime: runtime = runtime.split("'")[0].split( '’')[0] runtime = runtime.replace("'", '').replace( '’', '') extra = '%s' % runtime if objt.extra != extra: objt.extra = extra objt.save() else: info = 'Нет такого дистрибьютора' data_nof_film += xml_noffilm( f_name_ru, film_slug_ru, f_name_en, film_slug_en, film_id, info, full_url, source.id) noffilms.append(film_id) create_dump_file('%s_nof_distributor' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % distr_nof_data) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('html', source.dump, 'releases', 'Релизы')
def parse_data_ident(request, selected): """ Identify the fetched records """ debug_logs("start ident %s " % selected) # start timing the function run start = time.time() data_nof_film = '' noffilms = [] ignored = get_ignored_films() # identification type, passed as a parameter to the identification function ident_type = 'movie_online' # select all films flagged with afisha_id in (0, None); the parsers mark films this way, meaning they have never been matched against the kinoafisha base data = MovieMegogo.objects.filter(afisha_id__in=(0, None)) # take the parameters needed for identification, iterating over each film separately for i in data: year = i.year name_ru = i.title name_en = i.title_en country = i.country # normalize the ru/en titles for film identification name_ru_slug = del_separator(low(name_ru)) name_en_slug = del_separator(low(name_en)) # build the year range for film identification new_year = year + 2 old_year = year - 2 filter_year = {'year__gte': old_year, 'year__lte': new_year} try: # pass the film to the identification function kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, filter_year, ident_type, country) if kid: # store the result on the model i.afisha_id = kid i.save() else: if i.megogo_id not in noffilms and name_ru_slug.decode('utf-8') not in ignored: data_nof_film += xml_noffilm(name_ru.encode('utf-8'), name_ru_slug, None, None, i.megogo_id, info, i.page.encode('utf-8')) noffilms.append(i.megogo_id) except db.backend.Database._mysql.OperationalError: if i.megogo_id not in noffilms and name_ru_slug.decode('utf-8') not in ignored: data_nof_film += xml_noffilm(name_ru.encode('utf-8'), name_ru_slug, None, None, i.megogo_id, None, i.page.encode('utf-8')) noffilms.append(i.megogo_id) # function execution time finish = time.time() timer = "%.2f мин" % ((float(finish - start)) / 60) debug_logs("finish") debug_logs("timer: %s " % timer) debug_logs("Идентификация: название %s / инфо %s %s" % (name_ru_slug, kid, info)) source = ImportSources.objects.get(url='http://megogo.net/') create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) # return to the interface return simplejson.dumps({ 'request_type': 1, 'timer': timer, })
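# parse_data_ident() widens the match to a window of two years on either
# side before calling film_identification(). The same filter as a tiny
# helper (year_window is a hypothetical name):
def year_window(year, spread=2):
    return {'year__gte': year - spread, 'year__lte': year + spread}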
def get_luxor_films(): query = 'QueryCode=GetMovies' data = get_luxor_data_by_socket(query) source = ImportSources.objects.get(url='http://luxor.ru/') sfilm_clean(source) #create_dump_file('%s_films' % source.dump, settings.API_DUMP_PATH, data) data_nof_films = '' noffilms = [] films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) ''' xml = open('%s/dump_%s_films.xml' % (settings.API_DUMP_PATH, source.dump), 'r')# temp data = xml.read()# temp xml.close()# temp ''' ignored = get_ignored_films() xml_data = BeautifulSoup(data, from_encoding="utf-8") for film in xml_data.findAll('movie'): film_id = film['id'].encode('utf-8') film_name = film.find('othername').string.encode('utf-8').replace('[CDATA[', '').replace(']]', '') film_slug = low(del_separator(del_screen_type(film_name))) if film_id not in noffilms and film_slug.decode('utf-8') not in ignored: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification(film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func(fdict, kid, obj) if create_new: new = create_sfilm(film_id, kid, source, film_name) films[film_id] = new if not fdict.get(kid): fdict[kid] = {'editor_rel': [], 'script_rel': []} fdict[kid]['script_rel'].append(new) elif not obj: data_nof_films += xml_noffilm(film_name, film_slug, None, None, film_id, info, None, source.id) noffilms.append(film_id) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films) cron_success('xml', source.dump, 'films', 'Фильмы')
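# The not-found dumps in these parsers are assembled by string formatting;
# a film name containing '<', '&' or quotes would corrupt the XML. A sketch
# of attribute escaping with the stdlib (the real xml_noffilm helper lives
# elsewhere and may already handle this):
from xml.sax.saxutils import quoteattr

def film_tag(name, slug):
    # quoteattr wraps the value in quotes and escapes XML metacharacters
    return '<film name=%s slug=%s></film>' % (quoteattr(name), quoteattr(slug))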
def get_zapad24ru(): ignored = get_ignored_films() ignored_cinemas = get_ignored_cinemas() source = ImportSources.objects.get(url='http://zapad24.ru/') sfilm_clean(source) cities_dict = get_source_data(source, 'city', 'dict') cinemas_dict = get_source_data(source, 'cinema', 'dict') schedules = get_source_data(source, 'schedule', 'list') films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) today = datetime.datetime.now() next_month = datetime.date.today() + datetime.timedelta(days=40) data_nof_films = '' data_nof_cinema = '' data_nof_city = '' noffilms = [] req = urllib.urlopen('%safisha/' % source.url) if req.getcode() == 200: data = BeautifulSoup(req.read()) #, from_encoding="utf-8" div = data.find('div', align="left") for ind, table in enumerate( div.findAll('table', border="0", cellpadding="0", cellspacing="0", width="100%")): cinema_tag = table.find('strong').string.encode('utf-8') cinema_name = re.findall(r'\".+\"', cinema_tag)[0].replace('"', '').strip() cinema_slug = low(del_separator(cinema_name)) cinema_id = cinema_slug.decode('utf-8') city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace( '(г. ', '').replace(')', '').strip() city_slug = low(del_separator(city_name)) city_id = city_slug.decode('utf-8') city_obj = cities_dict.get(city_id) if not city_obj: city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk') if city.count() == 1: city_obj = SourceCities.objects.create( source_id=city_id, source_obj=source, city=city[0], name=city_name, ) cities_dict[city_id] = city_obj else: if 'slug="%s"' % city_slug not in data_nof_city: data_nof_city += '<city name="%s" slug="%s"></city>' % ( city_name, city_slug) if city_obj: cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid) if cinema_ig_id not in ignored_cinemas: cinema_obj = cinemas_dict.get(cinema_id) if not cinema_obj: filter1 = { 'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city } cinema_kid = cinema_identification( cinema_slug, filter1) if cinema_kid: try: cinema = Cinema.objects.get(code=cinema_kid) cinema_obj = SourceCinemas.objects.create( source_id=cinema_id, source_obj=source, city=city_obj, cinema=cinema, name=cinema_name, ) cinemas_dict[cinema_id] = cinema_obj except Cinema.DoesNotExist: pass else: if 'slug="%s"' % cinema_slug not in data_nof_cinema: data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % ( cinema_name, cinema_slug, city_name, city_obj.city.kid) if cinema_obj: film_table = table.find('table') date_from = None date_to = None for tr in film_table.findAll('tr'): film_name, film_slug, film_id = (None, None, None) if ind == 0: film_name = tr.find('b').string.encode( 'utf-8').strip() film_slug = low(del_separator(film_name)) film_id = film_slug.decode('utf-8') else: showdate = '' for f in tr.findAll('b'): if f.find('span'): showdate = f.find( 'span').string.encode( 'utf-8').strip() else: film_name = f.string.encode( 'utf-8').strip() film_name = re.findall( r'\«.+\»', film_name)[0] film_name = film_name.replace( '«', '').replace('»', '').strip() film_slug = low( del_separator(film_name)) film_id = film_slug.decode('utf-8') if showdate and film_name: try: date_from, date_to = showdate.split( '-') date_from_day, date_from_month = date_from.strip( ).split('.') date_to_day, date_to_month = date_to.strip( ).split('.') except ValueError: date_from, date_to = showdate.split( ' – ') date_from_day, date_from_month = 
date_from.strip().split() date_from_month = get_month(date_from_month) date_to_day, date_to_month = date_to.strip().split() date_to_month = get_month(date_to_month) date_from = datetime.date(today.year, int(date_from_month), int(date_from_day)) date_to = datetime.date(today.year, int(date_to_month), int(date_to_day)) full_url = tr.find('a').get('href').encode('utf-8') if film_id not in noffilms and film_id not in ignored: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification(film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func(fdict, kid, obj) if create_new: objt = create_sfilm(film_id, kid, source, film_name) films[film_id] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append(objt) elif not obj: data_nof_films += xml_noffilm(film_name, film_slug, None, None, film_id.encode('utf-8'), info, full_url, source.id) noffilms.append(film_id) if objt: req_film = urllib.urlopen(full_url) if req_film.getcode() == 200: data_film = BeautifulSoup(req_film.read()) #, from_encoding="utf-8" td = data_film.find('td', {'class': 'news'}).div.text.encode('utf-8') showtime = [] if ind == 0: showtime = re.findall(r'\d+\:\d+\s\s?', td) else: if date_from and date_to: if date_to < next_month: showtimes = re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td) times = [] for t in showtimes: t = t.replace('Начало сеансов:', '').split(',') times = [i.strip() for i in t if i.strip()] delta = date_to - date_from for day in range(delta.days + 1): d = date_from + datetime.timedelta(days=day) for t in times: hours, minutes = t.split('-') dtime = datetime.datetime(d.year, d.month, d.day, int(hours), int(minutes)) showtime.append(dtime) for t in showtime: if ind == 0: hours, minutes = t.strip().split(':') dtime = datetime.datetime(today.year, today.month, today.day, int(hours), int(minutes)) else: dtime = t sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id.encode('utf-8')) sch_id = sch_id.replace(' ', '').decode('utf-8') if sch_id not in schedules: SourceSchedules.objects.create( source_id=sch_id, source_obj=source, film=objt, cinema=cinema_obj, dtime=dtime, ) schedules.append(sch_id) create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city) create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films) cron_success('html', source.dump, 'schedules', 'Сеансы')
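# The zapad24.ru parser above expands a 'from - to' showdate range day by
# day with a timedelta loop. The same inclusive expansion as a generator
# sketch (daterange is a hypothetical name):
import datetime

def daterange(date_from, date_to):
    # inclusive range of dates, matching the delta.days + 1 loop above
    for offset in range((date_to - date_from).days + 1):
        yield date_from + datetime.timedelta(days=offset)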
def get_planeta_films(): ignored = get_ignored_films() source = ImportSources.objects.get(url='http://planeta-kino.com.ua/') sfilm_clean(source) films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) data_nof_film = '' noffilms = [] for i in planeta_kino_urls: xml = open( '%s/dump_planetakino_%s.xml' % (settings.API_DUMP_PATH, i['city']), 'r') xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8") xml.close() for film in xml_data.findAll('movie'): film_id = film['id'] if film_id not in noffilms: film_url = film['url'] film_name = film.title.text.replace( '"', "'").encode('utf-8').strip() film_slug = low(del_separator(del_screen_type(film_name))) if film_slug.decode('utf-8') not in ignored: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification(film_slug, film_name, {}, {}, source=source) if kid: create_new, objt = unique_func(fdict, kid, obj) if create_new: new = create_sfilm(film_id, kid, source, film_name) films[film_id] = new if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append(new) elif not obj: data_nof_film += xml_noffilm( film_name, film_slug, None, None, film_id.encode('utf-8'), info, None, source.id) noffilms.append(film_id) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('xml', source.dump, 'films', 'Фильмы')
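# Every parser in this file repeats the same identification skeleton: check
# the per-source cache, otherwise identify by slug, then either register a
# new SourceFilms row or report the film as not found. A condensed sketch of
# that flow using the same helpers the functions above rely on
# (identify_or_log is a hypothetical name; not-found logging and error
# handling are left to the caller):
def identify_or_log(film_id, film_name, film_slug, films, fdict, source):
    obj = films.get(film_id)
    if not checking_obj(obj):
        return obj
    kid, info = (obj.kid, None) if obj else film_identification(
        film_slug, None, {}, {}, source=source)
    if not kid:
        return None  # caller appends an xml_noffilm(...) record
    create_new, objt = unique_func(fdict, kid, obj)
    if create_new:
        objt = create_sfilm(film_id, kid, source, film_name)
        films[film_id] = objt
        fdict.setdefault(kid, {'editor_rel': [], 'script_rel': []})['script_rel'].append(objt)
    return objt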