def get_luxor_chuvashia_schedules():
    """Scrape showtimes from luxor.chuvashia.com for two hard-coded cinemas
    (Cheboksary/"Mir Luxor" and Novocheboksarsk/"Atal") and store new
    SourceSchedules rows.  Unidentified films are dumped to a NOF xml file.
    Python 2: names are handled as utf-8 byte strings and decoded ad hoc.
    """
    today = datetime.datetime.now().strftime('%d.%m.%Y')
    data_nof_film = ''       # accumulated xml for films that could not be identified
    noffilms = []            # source ids already reported as not-found
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://luxor.chuvashia.com/')
    sfilm_clean(source)
    films = {}               # source_id -> SourceFilms row, cache of known films
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')  # already-imported schedule ids
    data = [
        {
            'city': 'Чебоксары',
            'cinema': 'Мир Луксор',
            'url': '%sschedule.aspx?kinoteatr=luxor' % source.url
        },
        {
            'city': 'Новочебоксарск',
            'cinema': 'Атал',
            'url': '%sschedule.aspx?kinoteatr=atal' % source.url
        },
    ]

    def get_page_data(date, data_list):
        # NOTE: closes over loop variable `i` from the `for i in data:` loop
        # below — it is only ever called from inside that loop.
        # Recursively follows calendar links to later dates, appending one
        # {'date', 'title', 'sch'} dict per ScheduleTitle block.
        url = '%s&date=%s' % (i['url'], date)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            page_data = BeautifulSoup(req.read())
            div = page_data.find('div', id='BodyContener_ScheduleBlock')
            table = div.find('table', id='BodyContener_TCalendar')
            for j in div.findAll('div', {'class': 'ScheduleTitle'}):
                data_list.append({
                    'date': date,
                    'title': j,
                    'sch': j.next_sibling
                })
            day, month, year = date.split('.')
            date_obj_current = datetime.date(int(year), int(month), int(day))
            for a in table.findAll('a'):
                link = a.get('href')
                # extract the "=dd.mm.yyyy" date fragment from the calendar link
                d = re.findall(r'\=[\d+\.?]+', link.encode('utf-8'))[0].replace('=', '')
                day, month, year = d.split('.')
                date_obj = datetime.date(int(year), int(month), int(day))
                if date_obj > date_obj_current:
                    # recurse only forward in time so we terminate
                    get_page_data(d, data_list)
        return data_list

    for i in data:
        city_slug = low(del_separator(i['city']))
        city = City.objects.get(name__name=i['city'], name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': i['city'],
            })
        cinema_slug = low(del_separator(i['cinema']))
        cinema = Cinema.objects.get(name__name=i['cinema'], name__status=1, city=city)
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': i['cinema'],
            })
        data_list = get_page_data(today, [])
        for schedule in data_list:
            tag_a = schedule['title'].find('a')
            film_name = tag_a.text.encode('utf-8')
            film_slug = low(del_separator(del_screen_type(film_name)))
            film_url = tag_a.get('href')
            film_id = film_url.replace('films.aspx?id=', '').encode('utf-8')
            full_url = '%s%s' % (source.url, film_url)
            if film_id not in noffilms and film_slug.decode(
                    'utf-8') not in ignored:
                obj = films.get(film_id)
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        # identify the film by slug; info is the failure reason
                        kid, info = film_identification(film_slug, None, {}, {}, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film_id, kid, source, film_name)
                            films[film_id] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {
                                    'editor_rel': [],
                                    'script_rel': []
                                }
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        # film could not be identified — record it once
                        data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                     film_id, info,
                                                     full_url.encode('utf-8'), source.id)
                        noffilms.append(film_id)
                    if objt:
                        sch_div = schedule['sch'].find('div', {
                            'class': 'ScheduleClock'
                        }).text.encode('utf-8').strip()
                        showtimes = re.findall(r'\d+\:\d+', sch_div)
                        day, month, year = schedule['date'].split('.')
                        for showtime in showtimes:
                            hours, minutes = showtime.split(':')
                            dtime = datetime.datetime(int(year), int(month), int(day),
                                                      int(hours), int(minutes))
                            # dedup key: datetime + cinema + city + film id
                            sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                            sch_id = sch_id.replace(' ', '').decode('utf-8')
                            if sch_id not in schedules:
                                SourceSchedules.objects.create(
                                    source_id=sch_id,
                                    source_obj=source,
                                    film=objt,
                                    cinema=cinema_obj,
                                    dtime=dtime,
                                )
                                schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def cinemate_cc_soon():
    '''
    login = cinemate_cc_login()
    if login['error']:
        return HttpResponse(str(login['error']))
    else:
        opener = login['opener']
        source = login['source']
    '''
    # (docstring slot above is disabled legacy login code, kept intact)
    # Scrape "soon" and "in cinema" film listings from cinemate.cc, create
    # SourceFilms rows for new films and, if anything new appeared, post an
    # internal notification message.
    source = ImportSources.objects.get(url='http://cinemate.cc/')
    opener = give_me_cookie()
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    sfilm_clean(source)
    films = {}  # int source_id -> SourceFilms row (ids on this source are numeric)
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[int(i.source_id)] = i
    fdict = get_all_source_films(source, source_films)
    send_msg = False
    for main_url in ('%smovies/soon' % source.url, '%smovies/cinema' % source.url):
        req = opener.open(urllib2.Request(main_url))
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        nav = data.find('div', {'class': "navigation"})
        nav_link = nav.findAll('a')[-1]
        # pagination is capped at 10 pages per listing
        last_page = int(nav_link.get('href').split('?page=')[-1])
        if last_page > 10:
            last_page = 10
        film_list = get_cinemate_cc_film(data, source, ignored, noffilms)
        for page in xrange(2, (last_page + 1)):
            time.sleep(random.uniform(1.0, 2.5))  # be polite to the remote site
            url = '%s?page=%s' % (main_url, page)
            try:
                req = opener.open(urllib2.Request(url))
                data = BeautifulSoup(req.read(), from_encoding="utf-8")
                film_list += get_cinemate_cc_film(data, source, ignored, noffilms)
            except urllib2.HTTPError:
                pass  # best-effort: a failed page is simply skipped
        for i in film_list:
            obj = films.get(i['id'])
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    kid, info = film_identification(i['slug'], None, {}, {},
                                                    year=i['year'], source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        objt = create_sfilm(i['id'], kid, source, i['name'],
                                            year=i['year'],
                                            txt=datetime.datetime.now().date(),
                                            extra='new')
                        films[i['id']] = objt
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(objt)
                        send_msg = True  # at least one new film: notify below
                elif not obj:
                    data_nof_film += xml_noffilm(i['name'], i['slug'], None, None,
                                                 i['id'], info,
                                                 i['url'].encode('utf-8'), source.id)
                    noffilms.append(i['id'])
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы в сети')
    if send_msg:
        # send an internal site message about newly discovered films
        current_site = DjangoSite.objects.get(domain='kinoinfo.ru')
        msg_from = Profile.objects.get(user__last_name='SYSTEM')
        msg_to = Profile.objects.get(
            accounts__login='******')  # [email protected]
        msg = 'В сети появились новые фильмы <a href="http://kinoinfo.ru/torrents/listing/%s/" target="_blank">http://kinoinfo.ru/torrents/listing/%s/</a>' % (
            source.id, source.id)
        try:
            dialog_exist = DialogMessages.objects.filter(
                readers__user=msg_to,
                readers__message__autor=msg_from).order_by('-id')[0]
        except IndexError:
            dialog_exist = None
        reader_type = '1'  # NOTE(review): unused local — reader_type is passed literally below
        msg_obj = News.objects.create(
            title='Сообщение',
            text=msg,
            autor=msg_from,
            site=current_site,
            subdomain='0',
            reader_type='1',
        )
        reader = NewsReaders.objects.create(user=msg_to, status='0', message=msg_obj)
        if dialog_exist:
            dialog_exist.readers.add(reader)
        else:
            dialog_obj = DialogMessages()
            dialog_obj.save()
            dialog_obj.readers.add(reader)
def get_premierzal_schedules():
    """Scrape showtimes from premierzal.ru for every known cinema of every
    known city of this source.  On this site the film slug doubles as the
    film id.  New showtimes are stored as SourceSchedules rows.
    """
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    # group cinemas by their city so each city name is url-quoted only once
    cities_cinemas = {}
    for i in SourceCinemas.objects.select_related('city').filter(
            source_obj=source):
        if not cities_cinemas.get(i.city.source_id):
            cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []}
        cities_cinemas[i.city.source_id]['cinemas'].append(i)
    for k, v in cities_cinemas.iteritems():
        city_url_encode = urllib.quote(v['city'].name.encode('utf-8'))
        for i in v['cinemas']:
            main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id,
                                                  city_url_encode)
            main_req = urllib.urlopen(main_url)
            if main_req.getcode() == 200:
                data = BeautifulSoup(main_req.read())
                data = data.find('div', id="films-list")
                if data:
                    # collect all dates offered by the calendar widget
                    dates = []
                    for calendar in data.findAll('table', {'class': 'calendar'}):
                        for a in calendar.findAll('a'):
                            href = a.get('href', '')
                            href_dict = dict(cgi.parse_qsl(href))
                            # key may be '?date' or 'date' depending on link form
                            calendar_date = href_dict.get(
                                u'?date', href_dict.get(u'date'))
                            if calendar_date:
                                dates.append({
                                    'date': calendar_date,
                                    'href': href
                                })
                    for ind, d in enumerate(dates):
                        films_blocks = []
                        if ind == 0:
                            # first date is already on the page we fetched
                            films_blocks = data.findAll(
                                'div', {'class': 'film-item-wrapper'})
                        else:
                            url = '%s?date=%s&city=%s&theatre=%s' % (
                                source.url, d['date'], city_url_encode,
                                i.source_id)
                            req = urllib.urlopen(url)
                            if req.getcode() == 200:
                                # NOTE(review): `data` is rebound to the new
                                # page here, shadowing the first page's soup.
                                data = BeautifulSoup(req.read())
                                data = data.find('div', id="films-list")
                                films_blocks = data.findAll(
                                    'div', {'class': 'film-item-wrapper'})
                            time.sleep(random.uniform(0.8, 2.2))
                        for block in films_blocks:
                            title = block.find('div', {
                                'class': 'title'
                            }).find('a')
                            film_name = title.text.encode('utf-8').strip()
                            film_slug = low(
                                del_separator(del_screen_type(film_name)))
                            film_id = film_slug  # slug doubles as source film id
                            if film_id not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:
                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug, None, {}, {}, source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source, film_name)
                                            films[film_id.decode(
                                                'utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info, None, source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        year, month, day = d['date'].split(
                                            u'-')
                                        for tm in block.findAll(
                                                'div', {'class': 'seanse-item'}):
                                            # times come '|'-separated, e.g. "10:00 | 12:30"
                                            for t in tm.text.encode(
                                                    'utf-8').split('|'):
                                                t = re.findall(
                                                    r'\d{2}\:\d{2}', t)
                                                if t:
                                                    hours, minutes = t[
                                                        0].strip().split(':')
                                                    dtime = datetime.datetime(
                                                        int(year), int(month),
                                                        int(day), int(hours),
                                                        int(minutes))
                                                    sch_id = '%s%s%s' % (
                                                        dtime,
                                                        i.source_id.encode(
                                                            'utf-8'), film_id)
                                                    sch_id = sch_id.replace(
                                                        ' ', '').decode('utf-8')
                                                    if sch_id not in schedules:
                                                        SourceSchedules.objects.create(
                                                            source_id=sch_id,
                                                            source_obj=source,
                                                            film=objt,
                                                            cinema=i,
                                                            dtime=dtime,
                                                        )
                                                        schedules.append(
                                                            sch_id)
            time.sleep(random.uniform(1.1, 1.8))  # throttle between cinemas
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def page_parser(city_name, cinema_name, source):
    """Scrape one week of showtimes for a single cinema from a
    "/sessions/<date>" style site and store new SourceSchedules rows.

    Args:
        city_name: human-readable city name (utf-8 str).
        cinema_name: human-readable cinema name (utf-8 str).
        source: URL of the ImportSources record to scrape.

    Returns:
        Accumulated NOF xml (str) for films that could not be identified.
    """
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url=source)
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    main_url = '%ssessions/' % source.url
    today = datetime.date.today()
    next_week = today + datetime.timedelta(days=6)
    delta = next_week - today
    for day in range(delta.days + 1):  # today plus the next six days
        date_obj = today + datetime.timedelta(days=day)
        url = '%s%s' % (main_url, date_obj)  # ISO date appended to /sessions/
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id="section-session")
            if main:
                main = main.find('table')
                for tr in main.findAll('tr'):
                    showtime, film = tr.findAll('td', limit=2)
                    hours, minutes = showtime.string.split(':')
                    film_a = film.findAll('a')
                    if film_a:
                        # second <a> (when present) is the film link
                        film_a = film_a[1] if len(film_a) > 1 else film_a[0]
                        full_url = film_a.get('href')
                        # BUGFIX: original called
                        #   full_url.replace('%sfilms/', source.url)
                        # which replaces the literal text '%sfilms/' with the
                        # base URL instead of stripping the
                        # '<source.url>films/' prefix.  Strip the prefix and
                        # remaining slashes to obtain the bare film id.
                        film_id = full_url.replace('%sfilms/' % source.url,
                                                   '').replace('/', '').encode('utf-8')
                        film_name = del_screen_type(
                            film_a.get('title').encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))
                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id, info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    dtime = datetime.datetime(
                                        date_obj.year, date_obj.month,
                                        date_obj.day, int(hours), int(minutes))
                                    # dedup key: datetime + cinema + city + film id
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                           city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
    return data_nof_film
def get_vkinocomua_films_and_schedules():
    """Scrape per-cinema showtime pages from vkino.com.ua and store new
    SourceSchedules rows (with the booking link in `extra` when present)."""
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    cinemas_data = SourceCinemas.objects.select_related('city').filter(source_obj=source)
    cinemas = {}  # NOTE(review): never populated/used below
    for ind, i in enumerate(cinemas_data):
        url = '%scinema/%s/%s/showtimes' % (source.url, i.city.source_id, i.source_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='cinema-showtimes')
            if main:
                for content in main.findAll('div', {'class': 'content'}):
                    film_tag = content.find('a', {'class': 'navi'})
                    film_name = film_tag.string.encode('utf-8').strip()
                    film_slug = low(del_separator(film_name))
                    full_url = film_tag.get('href').encode('utf-8')
                    # numeric id from the href, e.g. '/film/12345/'; fall back to slug
                    film_id = re.findall(r'\/\d+\/', full_url)
                    if film_id:
                        film_id = film_id[0].replace('/', '').encode('utf-8')
                    else:
                        film_id = film_slug
                    full_url = '%s%s' % (source.url, full_url.lstrip('/'))
                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id.decode('utf-8'))
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    # NOTE(review): cache key is the byte string
                                    # here but the lookup above decodes it —
                                    # looks inconsistent; confirm intent.
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                # NOTE(review): reports the cinema page `url`,
                                # not the film's `full_url` — confirm intent.
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                             film_id, info,
                                                             url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                            if objt:
                                for div in content.findAll('div', {'class': 'date'}):
                                    year, month, day = div['data-date'].split('-')
                                    show = div.find_next_sibling("ul")
                                    for li in show.findAll('li'):
                                        if li.a:
                                            # linked time => online booking available
                                            extra = li.a.get('href')
                                            hours, minutes = li.a.text.strip().split(':')
                                        else:
                                            extra = None
                                            hours, minutes = li.text.strip().split(':')
                                        # sale = True if extra else False
                                        dtime = datetime.datetime(int(year), int(month), int(day),
                                                                  int(hours), int(minutes))
                                        sch_id = u'%s%s%s%s' % (dtime, i.source_id, i.city_id,
                                                                film_id.decode('utf-8'))
                                        sch_id = sch_id.replace(' ', '')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id = sch_id,
                                                source_obj = source,
                                                film = objt,
                                                cinema = i,
                                                dtime = dtime,
                                                extra = extra,
                                            )
                                            schedules.append(sch_id)
        if ind % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))  # throttle every 4th cinema
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_arsenalclub_schedules():
    """Scrape one week of showtimes from arsenal-club.com (single cinema
    "Арсенал" in Neftekamsk).  The film id is the kinopoisk id when the film
    links to kinopoisk, otherwise the slug.  Stops early once a day page has
    an empty schedule table (flag)."""
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Нефтекамск'
    cinema_name = 'Арсенал'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://arsenal-club.com/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    today = datetime.datetime.now().date()
    future = today + datetime.timedelta(days=6)
    delta = future - today
    for d in range(delta.days + 1):  # today plus six days ahead
        date = today + datetime.timedelta(days=d)
        flag = False  # set when the day's schedule table is empty
        url = '%skino/?rasdel=kino&day=%s#daybox' % (source.url, date.strftime('%d.%m'))
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())  #, from_encoding="utf-8"
            # the schedule table is identified purely by presentation attrs
            for table in data.findAll('table', width="100%", cellpadding="3",
                                      cellspacing="1", bgcolor="#393939"):
                trs = table.findAll('tr', bgcolor="#292929")
                if len(trs) == 0:
                    flag = True
                else:
                    for tr in trs:
                        times, film, price = tr.findAll('td')
                        full_url = film.a.get('href').encode(
                            'utf-8') if film.a and film.a.get('href') else None
                        if full_url:
                            film_name = film.a.text.encode('utf-8').strip()
                        else:
                            film_name = film.text.encode('utf-8').strip()
                        film_slug = del_screen_type(
                            low(del_separator(film_name)))
                        if full_url:
                            # kinopoisk link => use the kinopoisk numeric id
                            film_id = full_url.replace(
                                'http://www.kinopoisk.ru/film/', '').encode('utf-8')
                        else:
                            film_id = film_slug
                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id, info, full_url, source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    hours, minutes = times.string.split(':')
                                    dtime = datetime.datetime(
                                        date.year, date.month, date.day,
                                        int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                           city_slug, film_id)
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        if flag:
            break  # empty day: no further days are scheduled
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_oreanda_and_spartak():
    """Scrape showtimes for the two Yalta cinemas on yaltakino.com
    ("Oreanda" and "Spartak"), each registered as a separate ImportSources
    record.  NOF dump and cron marker are written per source."""
    ignored = get_ignored_films()
    city_name = 'Ялта'
    city_slug = low(del_separator(city_name))
    xdata = (
        {
            'url': 'http://yaltakino.com/Oreanda/',
            'eng': 'Oreanda',
            'ru': 'Ореанда'
        },
        {
            'url': 'http://yaltakino.com/Spartak/',
            'eng': 'Spartak',
            'ru': 'Спартак'
        },
    )
    for data in xdata:
        data_nof_film = ''
        noffilms = []
        source = ImportSources.objects.get(url=data['url'])
        sfilm_clean(source)
        films = {}
        source_films = SourceFilms.objects.filter(source_obj=source)
        for i in source_films:
            films[i.source_id] = i
        fdict = get_all_source_films(source, source_films)
        schedules = get_source_data(source, 'schedule', 'list')
        city = City.objects.get(name__name=city_name, name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })
        cinema_name = data['ru']
        cinema_eng = data['eng']
        cinema_slug = low(del_separator(cinema_name))
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })
        main_url = '%sschedule/' % source.url
        req = urllib.urlopen(main_url)
        if req.getcode() == 200:
            # NOTE: rebinds the loop variable `data` to the parsed page
            data = BeautifulSoup(req.read())
            main = data.find('td', {'class': 'contentplaceholder'})
            for div in main.findAll('div', {'class': 'scheduleDayCaption'}):
                # caption looks like "<day month> / <weekday>"
                sess_date, sess_day = div.text.split(' / ')
                day, month = sess_date.split()
                month = get_month(month.encode('utf-8'))
                # assumes the whole visible schedule is in the current year —
                # TODO confirm around New Year
                year = datetime.datetime.now().year
                table = div.find_next('table')
                for tr in table.findAll('tr'):
                    if tr.find('td', {'class': 'scheduleTime'}):
                        hour, minute = tr.find('td', {
                            'class': 'scheduleTime'
                        }).text.split(':')
                        film = tr.find('a', {'class': 'scheduleLink'})
                        film_url = film.get('href')
                        full_url = 'http://yaltakino.com%s' % film_url
                        film_id = film_url.replace('/%s/?filmid=' % cinema_eng, '')
                        film_name = del_screen_type(
                            film.text.encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))
                        if film_id.encode(
                                'utf-8') not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id.encode('utf-8'))
                                if objt:
                                    dtime = datetime.datetime(
                                        year, int(month), int(day),
                                        int(hour), int(minute))
                                    sch_id = '%s%s%s%s' % (
                                        dtime, cinema_slug, city_slug,
                                        film_id.encode('utf-8'))
                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        # per-source dump and cron marker (inside the xdata loop)
        create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_film)
        cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_mailru_soon():
    """Scrape upcoming-release listings from afisha.mail.ru for the current
    month plus the next 12 months, creating/refreshing SourceReleases rows.
    For newly created releases the film's detail page is fetched once to pick
    up a poster link and a description."""
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='https://afisha.mail.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    today = datetime.datetime.today()
    # this month + 12 following months
    dates = list(
        map((lambda x: today.date() + relativedelta(months=x)), xrange(1, 13)))
    dates.insert(0, today.date())
    for d in dates:
        main_url = '%scinema/soon/%s/%s/' % (source.url, d.year, d.month)
        opener = give_me_cookie()
        #headers = {
        #    'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.2.2; en-us; Nexus 7 Build/JDQ39E) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30 CyanogenMod/10.1.3/grouper',
        #}
        #opener.addheaders = headers.items()
        try:
            req = opener.open(urllib2.Request(main_url))
        except urllib2.HTTPError:
            req = None  # month page missing: skip silently
        if req:
            data = BeautifulSoup(req.read(), "html.parser")
            for block in data.findAll('div', {'class': 'premiere__date'}):
                day = block.find('div', {'class': 'premiere__date__mday'}).text
                if day:
                    release_date = datetime.date(d.year, d.month, int(day))
                    for item in block.findAll('div', {'class': 'clearin'}):
                        a = item.find('div', {
                            'class': 'itemevent__head__name'
                        }).find('a')
                        film_name = a.text.strip().encode('utf-8')
                        film_slug = low(del_separator(film_name))
                        href = a.get('href')
                        film_id = href.replace('/cinema/movies/', '').replace(
                            '/', '').encode('utf-8')
                        full_url = '%s%s' % (source.url, href.lstrip('/'))
                        details = item.find('div', {
                            'class': 'itemevent__head__info'
                        }).text.encode('utf-8')
                        # production year appears as "/2016/" in the info line;
                        # NOTE(review): if absent, `year` stays an empty list
                        year = re.findall(r'\/\d{4}\/', details)
                        if year:
                            year = int(year[0].replace('/', ''))
                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:
                            obj = films.get(film_id.decode('utf-8'))
                            #OFC76 path from U+2009|e2 80 89|THIN SPACE
                            #in film name
                            film_slug = film_slug.decode("utf-8").replace(
                                u"\u2009", '').encode("utf-8")
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {},
                                        year=year, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id.decode('utf-8')] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id, info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    sr_obj, sr_created = SourceReleases.objects.get_or_create(
                                        film=objt,
                                        source_obj=source,
                                        defaults={
                                            'film': objt,
                                            'source_obj': source,
                                            'release': release_date,
                                        })
                                    if sr_created:
                                        # first sighting: fetch detail page for
                                        # poster + description (best effort)
                                        try:
                                            req = opener.open(
                                                urllib2.Request(full_url))
                                        except urllib2.HTTPError:
                                            req = None
                                        if req:
                                            data = BeautifulSoup(
                                                req.read(), "html.parser")
                                            movie_pic = data.find(
                                                'div', {
                                                    'class': 'movieabout__info__left'
                                                })
                                            pic = None
                                            if movie_pic:
                                                pic = movie_pic.find(
                                                    'a', {
                                                        'data-module': 'Gallery'
                                                    }).get('href')
                                            txt = None
                                            movie_txt = data.find(
                                                'div', {
                                                    'class': 'movieabout__info__descr__txt'
                                                })
                                            if movie_txt:
                                                txt = movie_txt.text.strip(
                                                ).encode('utf-8')
                                            if pic or txt:
                                                objt.text = txt
                                                objt.extra = pic
                                                objt.save()
                                        time.sleep(random.uniform(1.0, 1.5))
                                    else:
                                        # release already known: refresh the date
                                        if sr_obj.release != release_date:
                                            sr_obj.release = release_date
                                            sr_obj.save()
        time.sleep(random.uniform(1.0, 2.0))  # throttle between month pages
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Релизы')
def get_cinemaplex_releases():
    """Parse the cinemaplex.ru release-schedule table: date-range rows (one
    <td>) set the current release date; film rows that follow are matched by
    distributor and film name, then stored/updated as SourceReleases.
    Unknown distributors and unidentified films go to separate NOF dumps."""
    ignored = get_ignored_films()
    distr_nof_data = ''
    data_nof_film = ''
    noffilms = []
    nof_distributors = []
    distributors = {}  # distributor_slug -> kid cache
    source = ImportSources.objects.get(url='http://cinemaplex.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    today = datetime.datetime.today()
    url = '%s2013/01/30/release-schedule.html' % source.url
    '''
    with open('cinemaplex.htm','r') as f:
        main = BeautifulSoup(f.read(), from_encoding="utf-8")
        if main:
    '''
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        main = data.find('div', {'class': 'post-entry'})
        main = main.find('tbody')
        release_date = None
        for tr in main.findAll('tr'):
            all_td = tr.findAll('td')
            if len(all_td) == 1:
                # single-cell row: a date range header like "28 января — 3 февраля"
                if all_td[0].text.strip():
                    # the dash may be em-dash, en-dash or hyphen
                    try:
                        release_first, release_last = all_td[0].text.encode(
                            'utf-8').split('—')
                    except ValueError:
                        try:
                            release_first, release_last = all_td[
                                0].text.encode('utf-8').split('–')
                        except ValueError:
                            release_first, release_last = all_td[
                                0].text.encode('utf-8').split('-')
                    release_first = release_first.replace('\xc2\xa0', '').strip()  # strip NBSP
                    try:
                        release_first = int(release_first)
                    except ValueError:
                        # first part carries its own month name: reuse it as "last"
                        release_last = release_first
                        release_first = release_first.split()[0].strip()
                    release_month = release_last.strip().split()[1]
                    release_day = int(release_first)
                    release_month = int(get_month(release_month))
                    # the page has no year: treat dates within the last 4
                    # months (or earlier today) as already past
                    past_month_range = []
                    for m in [1, 2, 3, 4]:
                        past_dates = today - relativedelta(months=+m)
                        past_month_range.append(past_dates.month)
                    if release_month in past_month_range or (
                            release_month == today.month
                            and release_day <= today.day):
                        release_date = None
                    else:
                        release_year = today.year if release_month >= today.month else today.year + 1
                        release_date = datetime.date(release_year,
                                                     release_month,
                                                     release_day)
            elif release_date:
                # film row under the current (future) release date
                film_name = all_td[0].text.encode('utf-8').strip()
                # NOTE(review): replace('&', '&') is a no-op — it was probably
                # '&amp;' -> '&' before an encoding mishap; confirm against VCS.
                distributor = all_td[1].text.encode('utf-8').replace(
                    '&', '&').split(',')[0].strip()
                #copies = all_td[2].text.encode('utf-8').strip()
                runtime = all_td[3].text.encode('utf-8').strip()
                #genres = all_td[5].text.encode('utf-8').strip()
                #limits = all_td[7].text.encode('utf-8').strip()
                try:
                    details = all_td[8].text.encode('utf-8').strip()
                except IndexError:
                    details = ''
                # "Russian title / Original title"; single title is used for both
                f_name = film_name.split('/')
                if len(f_name) == 2:
                    f_name_ru, f_name_en = (f_name[0].strip(),
                                            f_name[1].strip())
                else:
                    f_name_ru, f_name_en = (f_name[0].strip(),
                                            f_name[0].strip())
                film_slug_ru = low(del_separator(f_name_ru))
                film_slug_en = low(del_separator(f_name_en))
                film_slug = low(del_separator(film_name))
                film_id = film_slug
                full_url = None
                '''
                current_release_date = re.findall(r'с\s\d+\.\d+', details)
                if current_release_date:
                    current_release_day = current_release_date[0].replace('с ','').split('.')[0]
                    current_release_date = datetime.date(int(release_date.year), int(release_date.month), int(current_release_day))
                else:
                    current_release_date = release_date
                '''
                if film_slug_ru:
                    if film_id not in noffilms and film_slug_ru.decode(
                            'utf-8') not in ignored:
                        # resolve distributor to a kid, caching both hits and misses
                        distributor_slug = low(del_separator(distributor))
                        distributor_kid = distributors.get(distributor_slug)
                        if not distributor_kid and distributor_slug.decode(
                                'utf-8') not in nof_distributors:
                            distr, status = distributor_identification(
                                distributor, distributor_slug)
                            if distr:
                                distributor_kid = distr.kid if distr.kid else None
                                distributors[
                                    distributor_slug] = distributor_kid
                            else:
                                distr_nof_data += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (
                                    distributor, distributor_slug, '')
                                nof_distributors.append(
                                    distributor_slug.decode('utf-8'))
                        if distributor_kid:
                            obj = films.get(film_id.decode('utf-8'))
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug_ru, film_slug_en,
                                        distributor_kid, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, f_name_ru)
                                        films[film_id.decode('utf-8')] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        f_name_ru, film_slug_ru, f_name_en,
                                        film_slug_en, film_id, info,
                                        full_url, source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    sr_obj, sr_created = SourceReleases.objects.get_or_create(
                                        film=objt,
                                        source_obj=source,
                                        defaults={
                                            'film': objt,
                                            'distributor': distributor,
                                            'source_obj': source,
                                            'release': release_date,
                                        })
                                    if not sr_created:
                                        if sr_obj.release != release_date:
                                            sr_obj.release = release_date
                                            sr_obj.save()
                                    # runtime like "120'" / "120’" -> plain minutes in extra
                                    runtime = runtime.replace('-', '').strip()
                                    if runtime:
                                        runtime = runtime.split("'")[0].split(
                                            '’')[0]
                                        runtime = runtime.replace("'", '').replace(
                                            '’', '')
                                        extra = '%s' % runtime
                                        if objt.extra != extra:
                                            objt.extra = extra
                                            objt.save()
                        else:
                            info = 'Нет такого дистрибьютора'
                            data_nof_film += xml_noffilm(
                                f_name_ru, film_slug_ru, f_name_en,
                                film_slug_en, film_id, info, full_url,
                                source.id)
                            noffilms.append(film_id)
    create_dump_file('%s_nof_distributor' % source.dump,
                     settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % distr_nof_data)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'releases', 'Релизы')
def get_zlat74ru_schedules():
    """Import showtimes for the single 'Космос' cinema in Златоуст from zlat74.ru.

    Scrapes the schedule table from the site's front page, identifies each
    film against the local DB, and creates SourceSchedules rows for any
    showtime not already imported.  Unidentified films are appended to the
    <data>...</data> "nof" XML dump.  No return value; side effects only
    (DB writes, dump file, cron-success marker).
    """
    ignored = get_ignored_films()
    data_nof_film = ''   # accumulated XML for films that could not be identified
    noffilms = []        # source ids already reported as not-found this run

    city_name = 'Златоуст'
    cinema_name = 'Космос'

    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://www.zlat74.ru/')
    sfilm_clean(source)

    # Map of already-known source films: source_id -> SourceFilms row
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    # Existing schedule ids, used to avoid duplicate inserts
    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1,
                                city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id='schedule')
        # The table interleaves date-header rows (<th>) with film rows (<td>);
        # `date` set by the last <th> row applies to following <td> rows.
        for tr in div.findAll('tr'):
            if tr.th:
                show_date = tr.th.string.encode('utf-8')
                day, month, year, temp = show_date.split()
                month = get_month(month)
                date = datetime.date(int(year), int(month), int(day))
            if tr.td:
                film_tag = tr.td.a
                film_id = film_tag.get('href').replace('/movies/',
                                                       '').encode('utf-8')
                film_name = film_tag.string.encode('utf-8')
                film_slug = del_screen_type(low(del_separator(film_name)))
                full_url = '%smovies/%s' % (source.url,
                                            film_id.decode('utf-8'))
                if film_id not in noffilms and film_slug.decode(
                        'utf-8') not in ignored:
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None,
                                                            {}, {},
                                                            source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source,
                                                    film_name)
                                films[film_id] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {
                                        'editor_rel': [],
                                        'script_rel': []
                                    }
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(
                                film_name, film_slug, None, None, film_id,
                                info, full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id)
                        if objt:
                            # NOTE(review): the loop variable shadows the
                            # `time` module inside this loop body.
                            for time in tr.findAll('span'):
                                hours, minutes = time.string.split(':')
                                dtime = datetime.datetime(
                                    date.year, date.month, date.day,
                                    int(hours), int(minutes))
                                sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                       city_slug, film_id)
                                sch_id = sch_id.replace(' ',
                                                        '').decode('utf-8')
                                if sch_id not in schedules:
                                    SourceSchedules.objects.create(
                                        source_id=sch_id,
                                        source_obj=source,
                                        film=objt,
                                        cinema=cinema_obj,
                                        dtime=dtime,
                                    )
                                    schedules.append(sch_id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_kinoboomer_schedules():
    """Import showtimes for the 'Boomer' cinema (Киев) from kinoboomer.com.ua.

    Three phases: collect film links from the /seances page, identify each
    film, then for identified films resolve the hall (via the embedded
    vkino purchase widget) and create SourceSchedules rows.  Films, halls
    and the cinema itself that cannot be matched are written to separate
    "nof" XML dump files.  Side effects only.
    """
    ignored = get_ignored_films()
    data_nof_film = ''
    data_nof_hall = ''
    data_nof_cinema = ''
    noffilms = []
    nofhalls = []
    city_name = 'Киев'
    cinema_name = 'Boomer'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://www.kinoboomer.com.ua/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    halls = get_source_data(source, 'hall', 'dict')
    city = City.objects.get(name__name=city_name, name__status=1)
    try:
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1,
                                    city=city)
    except Cinema.DoesNotExist:
        # Unknown cinema: record it in the nof dump and skip the whole import.
        cinema = None
        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city.kid)
    film_urls = []
    if cinema:
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id = city_slug,
            source_obj = source,
            defaults = {
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id = cinema_slug,
            source_obj = source,
            defaults = {
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })
        cinema_kid = cinema.code
        city_kid = city.kid
        today = datetime.date.today()
        # Phase 1: collect film id/name/url tuples from the seances page.
        url = '%sseances' % source.url
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            content = data.find('div', {'class': 'view-content'})
            for i in content.findAll('h3'):
                a = i.find('a')
                film_id = a.get('href').strip().encode('utf-8')
                full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                film_name = a.text.strip().encode('utf-8')
                film_slug = low(del_separator(film_name))
                film_urls.append({
                    'film_id': film_id,
                    'film_name': film_name,
                    'film_slug': film_slug,
                    'full_url': full_url,
                })
        # Phase 2: identify each film; phase 3 (nested): hall + schedules.
        for i in film_urls:
            if i['film_id'] not in noffilms and i['film_slug'].decode('utf-8') not in ignored:
                # Identify the film
                obj = films.get(i['film_id'])
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(i['film_slug'], None,
                                                        {}, {}, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            new = create_sfilm(i['film_id'], kid, source,
                                               i['film_name'])
                            films[i['film_id']] = new
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [],
                                              'script_rel': []}
                            fdict[kid]['script_rel'].append(new)
                    elif not obj:
                        data_nof_film += xml_noffilm(
                            i['film_name'], i['film_slug'], None, None,
                            i['film_id'], info,
                            i['full_url'].encode('utf-8'), source.id)
                        noffilms.append(i['film_id'])
                    # If the film was identified, resolve the hall
                    if objt:
                        req = urllib.urlopen(i['full_url'])
                        if req.getcode() == 200:
                            data = BeautifulSoup(req.read())
                            hall_name = ''
                            content = data.find('div',
                                                {'class': 'view-grouping-content'})
                            if content:
                                wrapper = content.findAll(
                                    'div', {'class': 'group-wrapper'}, limit=1)
                                if wrapper:
                                    # The hall name is scraped from the vkino
                                    # ticket-widget page, between « and ».
                                    widget_links = wrapper[0].findAll(
                                        'a', {'class': 'vkino-link'}, limit=1)
                                    widget_req = urllib.urlopen(
                                        widget_links[0].get('href'))
                                    if widget_req.getcode() == 200:
                                        widget_data = BeautifulSoup(
                                            widget_req.read(),
                                            from_encoding="utf-8")
                                        nav = widget_data.find(
                                            'div', id='purchase-navigation')
                                        li = nav.findAll('li', limit=1)[0]
                                        li.a.extract()
                                        li.nobr.extract()
                                        hall_name = li.text.strip().encode('utf-8').split('«')[-1].split('»')[0]
                                        hall_name_slug = low(del_separator(hall_name))
                            # hall_name_slug is only bound when hall_name is
                            # non-empty, so the short-circuit below is safe.
                            if hall_name and hall_name_slug not in nofhalls:
                                hall_obj = halls.get(hall_name_slug)
                                if not hall_obj:
                                    halls_obj = Hall.objects.filter(
                                        name__name=hall_name_slug,
                                        cinema=cinema_obj.cinema).distinct('pk')
                                    if halls_obj.count() == 1:
                                        hall_kid = halls_obj[0].kid
                                        hall_obj = SourceHalls.objects.create(
                                            source_id = hall_name_slug,
                                            source_obj = source,
                                            cinema = cinema_obj,
                                            name = hall_name,
                                            kid = hall_kid,
                                        )
                                        halls[hall_name_slug] = hall_obj
                                    else:
                                        # Ambiguous or unknown hall: dump it.
                                        id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name, hall_name_slug)
                                        id = id.replace(' ','')
                                        data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (city_name, city_kid, cinema_name, cinema_kid, hall_name, hall_name_slug, id)
                                        nofhalls.append(hall_name_slug)
                                if hall_obj:
                                    # Hall resolved: create the showtimes.
                                    for wrapper in content.findAll('div', {'class': 'group-wrapper'}):
                                        widget_links = wrapper.findAll('a', {'class': 'vkino-link'})
                                        for link in widget_links:
                                            # ISO datetime with timezone, e.g.
                                            # 2014-01-01T10:00:00+02:00
                                            dtime = link.find('span').get('content').replace('T', ' ').split('+')[0]
                                            dtime = datetime.datetime.strptime(dtime, "%Y-%m-%d %H:%M:%S")
                                            sch_id = '%s%s%s' % (dtime, hall_obj.id, i['film_id'])
                                            sch_id = sch_id.replace(' ', '').decode('utf-8')
                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id = sch_id,
                                                    source_obj = source,
                                                    film = objt,
                                                    cinema = cinema_obj,
                                                    hall = hall_obj.kid,
                                                    dtime = dtime,
                                                )
                                                schedules.append(sch_id)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_hall)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_kino_ru():
    """Import films and their user comments from kino.ru.

    Phase 1 walks the first three afisha pages, identifies films that have a
    comments block, and collects their comments URLs into ``forum_dict``.
    Phase 2 fetches each comments page, creates missing SourceUsers/Profile
    rows for comment authors, and stores each new comment as a News row
    linked to the film via NewsFilms.  Side effects only.
    """
    time.sleep(2)
    current_site = DjangoSite.objects.get(domain='kinoinfo.ru')
    # NOTE(review): these two compiled patterns are never used in this
    # function body — candidates for removal.
    REG_YEAR = re.compile(r'\d{4}\sгод.')
    REG_DATETIME = re.compile(r'\s?\-\s\d{2}\:\d{2}\:\d{2}\s\d{2}\s.*\s\d{4}')
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://www.kino.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    # Known comment authors: source user id -> SourceUsers row
    users = {}
    for i in SourceUsers.objects.select_related('profile').filter(
            source_obj=source):
        users[i.source_id] = i
    # Comment ids already imported (to avoid duplicates)
    text_ids = list(
        NewsFilms.objects.filter(source_obj=source).values_list('source_id',
                                                                flat=True))
    forum_dict = {}
    urls = (source.url, '%safisha/page/2' % source.url,
            '%safisha/page/3' % source.url)
    for url in urls:
        # films
        req = urllib.urlopen(url)
        if not req.getcode() == 200:
            # One retry after a short pause on a non-200 response.
            time.sleep(7)
            req = urllib.urlopen(url)
            print "CODE %s" % req.getcode()
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for article in data.findAll('article', {'class': "post"}):
                film_url = article.find('a', {'class': 'h2'})
                film_id = film_url.get('href')
                full_url = u'%s%s' % (source.url, film_id.lstrip('/'))
                film_id = film_id.replace('/film/', '')
                film_name = film_url.text.strip().encode('utf-8')
                film_slug = low(del_separator(film_name))
                info_country = article.find('div', {'class': 'info-country'})
                year = int(info_country.findAll('a')[-1].text.strip())
                comments_exist = article.find('div', {'class': 'comments'})
                # Only films that actually have comments are worth importing.
                if comments_exist and film_id.encode(
                        'utf-8') not in noffilms and film_slug.decode(
                            'utf-8') not in ignored:
                    forum_href = '%s/comments' % full_url
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None,
                                                            {}, {},
                                                            year=year,
                                                            source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source,
                                                    film_name, year=year)
                                films[film_id] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {
                                        'editor_rel': [],
                                        'script_rel': []
                                    }
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(
                                film_name, film_slug, None, None,
                                film_id.encode('utf-8'), info,
                                full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id.encode('utf-8'))
                        if objt:
                            forum_dict[film_id] = {
                                'obj': objt,
                                'href': forum_href
                            }
    # comments and their authors
    for k, v in forum_dict.iteritems():
        print k, v
        req = urllib.urlopen(v['href'])
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for post in data.findAll('div', {'class': 'post-comment'}):
                print post
                user_data = post.find('a', {'class': 'login_name'})
                if user_data:
                    user_name = user_data.text.strip().encode('utf-8')
                    user_id = user_data.get('href').replace('/user/', '')
                    user_obj = users.get(user_id)
                    if user_obj:
                        profile = user_obj.profile
                    else:
                        # First time we see this author: create a local user,
                        # its profile, and a SourceUsers mapping.
                        new_user = get_user()
                        new_user.first_name = user_name
                        new_user.save()
                        profile = Profile.objects.get(user=new_user)
                        user_obj = SourceUsers.objects.create(
                            source_id=user_id,
                            source_obj=source,
                            profile=profile,
                        )
                        users[user_id] = user_obj
                    date_comment = post.find('div', {'class': 'date-comment'})
                    print "date comment %s" % date_comment
                    com_time, com_date = date_comment.findAll('a')
                    com_day, com_month, com_year = com_date.text.encode(
                        'utf-8').strip().split()
                    com_month = get_month(com_month)
                    com_hour, com_minute = com_time.text.encode(
                        'utf-8').split(':')
                    com_dtime = datetime.datetime(int(com_year),
                                                  int(com_month),
                                                  int(com_day), int(com_hour),
                                                  int(com_minute), 0)
                    text_id = com_time.get('href').replace(
                        '/film/%s/comments/' % k, '')
                    text = post.find('div', {
                        'class': 'text-comment'
                    }).text.encode('utf-8').strip()
                    if text_id not in text_ids:
                        news = News.objects.create(
                            title='',
                            text=text,
                            visible=True,
                            autor=profile,
                            autor_nick=1,
                            site=current_site,
                            subdomain=0,
                            reader_type='8',
                        )
                        # dtime is set after create so the comment keeps its
                        # original timestamp instead of auto-now.
                        news.dtime = com_dtime
                        news.save()
                        NewsFilms.objects.create(
                            kid=v['obj'].kid,
                            message=news,
                            source_id=text_id,
                            source_obj=source,
                        )
                        text_ids.append(text_id)
def get_kinosaturn_schedules():
    """Import showtimes for the 'Сатурн' cinema (Александров) from kinosaturn.ru.

    The site publishes a date *range* per film ("с DD.MM по DD.MM") plus a set
    of daily showtimes; every showtime is expanded into one SourceSchedules
    row per day of the range.  Films that cannot be identified go to the
    "nof" XML dump.  Side effects only.
    """
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Александров'
    cinema_name = 'Сатурн'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://www.kinosaturn.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1,
                                city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    today = datetime.datetime.now()
    # Ranges ending more than ~40 days out are ignored (guards against
    # year-less dates parsed into the wrong year).
    next_month = datetime.date.today() + datetime.timedelta(days=40)
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  #, from_encoding="utf-8"
        tables = data.findAll('table',
                              width="560",
                              border="0",
                              cellspacing="0",
                              cellpadding="0")
        for table in tables:
            # The site uses named CSS classes in Russian ("стиль25" etc.)
            # as the only structural markers.
            film_name = table.find('div', {
                'class': u'стиль25'
            }).text.strip().encode('utf-8')
            film_slug = del_screen_type(low(del_separator(film_name)))
            # No real source id on this site — the slug doubles as id.
            film_id = film_slug
            if film_id not in noffilms and film_slug.decode(
                    'utf-8') not in ignored:
                obj = films.get(film_id.decode('utf-8'))
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(film_slug, None, {},
                                                        {}, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film_id, kid, source,
                                                film_name)
                            films[film_id.decode('utf-8')] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {
                                    'editor_rel': [],
                                    'script_rel': []
                                }
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        data_nof_film += xml_noffilm(film_name, film_slug,
                                                     None, None, film_id,
                                                     info, None, source.id)
                        noffilms.append(film_id)
                    if objt:
                        show_date = table.findAll(
                            'span', {'class': u'стиль23'},
                            limit=1)[0].string.strip().encode('utf-8')
                        # Two separator variants occur: " по " and " - ".
                        try:
                            date_from, date_to = show_date.split(' по ')
                        except ValueError:
                            date_from, date_to = show_date.split(' - ')
                        date_from_day, date_from_month = date_from.replace(
                            'с ', '').split('.')
                        date_to_day, date_to_month = date_to.split('.')
                        date_from = datetime.date(today.year,
                                                  int(date_from_month),
                                                  int(date_from_day))
                        date_to = datetime.date(today.year,
                                                int(date_to_month),
                                                int(date_to_day))
                        if date_to < next_month:
                            for cl in (u'стиль23 стиль35 стиль37',
                                       u'стиль23 стиль35'):
                                for t in table.findAll('span', {'class': cl}):
                                    hours, minutes = t.string.strip().encode(
                                        'utf-8').split(':')
                                    # Expand the showtime over every day in
                                    # the [date_from, date_to] range.
                                    delta = date_to - date_from
                                    for day in range(delta.days + 1):
                                        d = date_from + datetime.timedelta(
                                            days=day)
                                        dtime = datetime.datetime(
                                            d.year, d.month, d.day,
                                            int(hours), int(minutes))
                                        sch_id = '%s%s%s%s' % (
                                            dtime, cinema_slug, city_slug,
                                            film_id)
                                        sch_id = sch_id.replace(
                                            ' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_surkino_schedules():
    """Import showtimes from surkino.ru (multi-cinema aggregator).

    First collects the available dates from the front page's day selector,
    then for each date fetches the film list (windows-1251 encoded), matches
    cinemas against previously imported SourceCinemas, identifies films, and
    creates SourceSchedules rows.  Side effects only.
    """
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://surkino.ru/')
    sfilm_clean(source)
    # Cinemas must already exist from a prior cinema import: id -> obj
    cinemas = get_source_data(source, 'cinema', 'dict')
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    # Collect available dates (YYYY-MM-DD) from the day selector.
    dates = []
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        show_days = data.find('div', {'class': 'days'})
        for a in show_days.findAll('a'):
            dates.append(a.get('href').replace('?date=', ''))
    for d in dates:
        url = '%s?date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            # This site serves windows-1251, unlike the other sources.
            data = BeautifulSoup(req.read(), from_encoding="windows-1251")
            div = data.find('div', id='filmlist')
            if div:
                for cinema_tag in div.findAll('div', {'class': 'filmname'}):
                    cinema_id = cinema_tag.a.get('href').replace(
                        '?cinema=', '')
                    cinema_obj = cinemas.get(cinema_id)
                    if cinema_obj:
                        films_block = cinema_tag.find_next_siblings(
                            'div', limit=1)[0]
                        for tr in films_block.findAll('tr'):
                            film_tag = tr.findAll('a')
                            # A row can hold one or two links; the film link
                            # is the second one when both are present.
                            film_tag = film_tag[1] if len(
                                film_tag) == 2 else film_tag[0]
                            full_url = '%s%s' % (source.url,
                                                 film_tag.get('href'))
                            film_id = film_tag.get('href').replace(
                                '?film=', '').encode('utf-8')
                            film_name = film_tag.string.encode('utf-8')
                            film_slug = low(del_separator(film_name))
                            if film_id not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug, None, {}, {},
                                            source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info,
                                            full_url.encode('utf-8'),
                                            source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        # Time format on this site is HH.MM
                                        showtime = tr.td.string.encode('utf-8')
                                        hours, minutes = showtime.split('.')
                                        year, month, day = d.split('-')
                                        dtime = datetime.datetime(
                                            int(year), int(month), int(day),
                                            int(hours), int(minutes))
                                        sch_id = '%s%s%s' % (dtime, cinema_id,
                                                             film_id)
                                        sch_id = sch_id.replace(
                                            ' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_planeta_films():
    """Import films for Planeta Kino from locally dumped per-city XML files.

    Unlike the HTTP scrapers in this module, this reads pre-downloaded
    ``dump_planetakino_<city>.xml`` files from settings.API_DUMP_PATH (one
    per entry in the module-level ``planeta_kino_urls``), identifies each
    <movie>, and registers new SourceFilms.  Unidentified films go to the
    "nof" XML dump.  Side effects only.
    """
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://planeta-kino.com.ua/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    data_nof_film = ''
    noffilms = []
    for i in planeta_kino_urls:
        xml = open(
            '%s/dump_planetakino_%s.xml' % (settings.API_DUMP_PATH,
                                            i['city']), 'r')
        xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
        xml.close()
        for film in xml_data.findAll('movie'):
            film_id = film['id']
            if film_id not in noffilms:
                film_url = film['url']
                film_name = film.title.text.replace('"',
                                                    "'").encode('utf-8').strip()
                film_slug = low(del_separator(del_screen_type(film_name)))
                if film_slug.decode('utf-8') not in ignored:
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug,
                                                            film_name, {},
                                                            {}, source=source)
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                new = create_sfilm(film_id, kid, source,
                                                   film_name)
                                films[film_id] = new
                                if not fdict.get(kid):
                                    fdict[kid] = {
                                        'editor_rel': [],
                                        'script_rel': []
                                    }
                                fdict[kid]['script_rel'].append(new)
                        elif not obj:
                            data_nof_film += xml_noffilm(
                                film_name, film_slug, None, None,
                                film_id.encode('utf-8'), info, None,
                                source.id)
                            noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Фильмы')
def get_luxor_films():
    """Import the Luxor film list fetched over the cinema chain's socket API.

    Requests the movie list with the ``GetMovies`` query, parses the XML
    response, identifies each <movie> against the local DB and registers new
    SourceFilms.  Unidentified films are written to the "nof" XML dump.
    Side effects only (DB writes, dump file, cron-success marker).
    """
    query = 'QueryCode=GetMovies'
    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')
    sfilm_clean(source)

    # BUG FIX: the not-found branch previously appended to an undefined
    # name `data_nof_film` (NameError at runtime) while the dump was
    # written from `data_nof_films`; one consistent name is used now.
    data_nof_films = ''
    noffilms = []

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    ignored = get_ignored_films()

    xml_data = BeautifulSoup(data, from_encoding="utf-8")

    for film in xml_data.findAll('movie'):
        film_id = film['id'].encode('utf-8')
        # <othername> arrives wrapped in a CDATA marker; strip it.
        film_name = film.find('othername').string.encode('utf-8').replace(
            '[CDATA[', '').replace(']]', '')
        film_slug = low(del_separator(del_screen_type(film_name)))
        if film_id not in noffilms and film_slug.decode(
                'utf-8') not in ignored:
            obj = films.get(film_id)
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    kid, info = film_identification(film_slug, None, {}, {},
                                                    source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        new = create_sfilm(film_id, kid, source, film_name)
                        films[film_id] = new
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(new)
                elif not obj:
                    data_nof_films += xml_noffilm(film_name, film_slug, None,
                                                  None, film_id, info, None,
                                                  source.id)
                    noffilms.append(film_id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('xml', source.dump, 'films', 'Фильмы')
def get_kinobklass_schedules():
    """Import showtimes for the 'Б-Класс' cinema (Серпухов) from kino-bklass.ru.

    Fetches the schedule page for each of the next 7 days, identifies the
    films in the schedule table, and creates SourceSchedules rows.  Sleeps
    3s between day requests to be polite to the server.  Side effects only.
    """
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://kino-bklass.ru/')
    sfilm_clean(source)
    city_name = 'Серпухов'
    cinema_name = 'Кинотеатр в ТРК "Б-Класс"'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1,
                                city=city)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    # Walk today .. today+6 inclusive.
    today = datetime.datetime.now().date()
    to = today + datetime.timedelta(days=6)
    delta = to - today
    for day in range(delta.days + 1):
        d = today + datetime.timedelta(days=day)
        url = '%s?date=%s' % (source.url, d.strftime("%Y%m%d"))
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            table = data.findAll('table', id='rasp', limit=1)[0]
            # Film rows span the full table width (colspan=10); the showtime
            # cells follow in the next <tr>.
            for td in table.findAll('td', colspan='10'):
                full_url = td.a.get('href')
                film_id = full_url.replace('http://kino-bklass.ru/films/',
                                           '').replace('/',
                                                       '').encode('utf-8')
                film_name = td.a.h3.string.strip().split(' ')[0].encode(
                    'utf-8')
                film_slug = del_screen_type(low(del_separator(film_name)))
                if film_id not in noffilms and film_slug.decode(
                        'utf-8') not in ignored:
                    obj = films.get(film_id.decode('utf-8'))
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None,
                                                            {}, {},
                                                            source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source,
                                                    film_name)
                                films[film_id.decode('utf-8')] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {
                                        'editor_rel': [],
                                        'script_rel': []
                                    }
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(
                                film_name, film_slug, None, None, film_id,
                                info, full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id)
                        if objt:
                            tr = td.find_next('tr')
                            times = []
                            for time_tag in tr.findAll('td'):
                                t = None
                                if time_tag.string:
                                    t = time_tag.string.strip().encode(
                                        'utf-8')
                                if time_tag.b:
                                    t = time_tag.b.string.strip().encode(
                                        'utf-8')
                                if t:
                                    # Time separators vary: ':', '-' or '^'.
                                    try:
                                        hours, minutes = t.split(':')
                                    except ValueError:
                                        try:
                                            hours, minutes = t.split('-')
                                        except ValueError:
                                            hours, minutes = t.split('^')
                                    # '24:00' is not a valid datetime hour;
                                    # clamp to 23:59.
                                    if hours == '24':
                                        hours, minutes = (23, 59)
                                    # NOTE(review): this rebinds `day` and
                                    # shadows the outer loop variable; the
                                    # outer loop still works because `day` is
                                    # re-drawn from range() each iteration.
                                    year, month, day = str(d).split('-')
                                    dtime = datetime.datetime(
                                        int(year), int(month), int(day),
                                        int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                           city_slug,
                                                           film_id)
                                    sch_id = sch_id.replace(
                                        ' ', '').decode('utf-8')
                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        time.sleep(3.0)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_kinoteatr_data(opener, date, city_obj):
    """Scrape one day of one city's schedule page from kino-teatr.ua.

    Appears to be a nested helper: it reads/mutates enclosing-scope state
    (``source``, ``cinemas_dict``, ``nofcinemas``, ``films``, ``fdict``,
    ``ignored``, ``noffilms``, ``films_slugs``, ``schedules``,
    ``city_slug`` — none of which are defined in this body).

    :param opener: a urllib2 opener (carries the session cookies).
    :param date: dict with 'str' (URL date string) and 'obj' (datetime.date).
    :param city_obj: SourceCities row for the city being scraped.
    :returns: ``(nof_films_xml, nof_cinemas_xml, error_flag)`` where
        ``error_flag`` is ``'error'`` when the site shows an error page,
        else ``''``.
    """
    nof_films = ''
    nof_cinemas = ''
    url = '%sru/main/bill/order/cinemas/date/%s.phtml' % (source.url,
                                                          date['str'])
    req = opener.open(urllib2.Request(url))
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        main = data.find('div', id='news_page')
        if main:
            # The site renders an error banner inside the page body.
            if main.find('center', {'class': 'xErr'}):
                return nof_films, nof_cinemas, 'error'
            for cinema_tag in main.findAll('span', id='afishaKtName'):
                cinema_name_block = cinema_tag.findAll('a', limit=1)[0]
                cinema_name = cinema_name_block.text.encode(
                    'utf-8').replace('Кинотеатр', '')
                cinema_slug = low(
                    del_separator(del_screen_type(cinema_name)))
                cinema_name = cinema_name.replace('"', "'").replace(
                    '&', '&').strip()
                # Two URL formats occur: .../cinema_id/NNN or a path whose
                # trailing digits are the id.
                cinema_id = cinema_name_block.get('href').replace(
                    '.phtml', '')
                if 'cinema_id' in cinema_id:
                    cinema_id = cinema_id.replace(
                        'http://kino-teatr.ua/ru/main/cinema/cinema_id/',
                        '').encode('utf-8')
                else:
                    cinema_id = re.findall(r'\d+$', cinema_id)[0]
                if cinema_id not in nofcinemas:
                    cinema_obj = cinemas_dict.get(str(cinema_id))
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id
                        }
                        cinema_kid = cinema_identification(
                            cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(
                                    code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[str(cinema_id)] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            # Name may already be utf-8 bytes or unicode.
                            try:
                                name_city = city_obj.name.encode('utf-8')
                            except UnicodeDecodeError:
                                name_city = city_obj.name
                            nof_cinemas += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug, name_city,
                                city_obj.city.kid)
                            nofcinemas.append(cinema_id)
                    if cinema_obj:
                        films_block = cinema_tag.find_next_sibling('div')
                        for film_block in films_block.findAll(
                                'div', id='afishaItem'):
                            film_name = film_block.find(
                                'div', {'class': 'filmName'})
                            full_url = film_name.a.get('href').encode(
                                'utf-8')
                            if film_name.a.text:
                                film_name = film_name.a.text.encode(
                                    'utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = full_url.replace(
                                    'http://kino-teatr.ua/film/',
                                    '').replace('.phtml',
                                                '').encode('utf-8')
                                if film_slug.decode(
                                        'utf-8'
                                ) not in ignored and film_id not in noffilms:
                                    obj = films.get(film_id)
                                    next_step = True if obj and obj.rel_ignore else False
                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(
                                                film_slug, None, {}, {},
                                                source=source)
                                        if not obj:
                                            if kid:
                                                # Fetch the Ukrainian page to
                                                # store the UA title/synopsis.
                                                uk_url = '%suk/film/%s' % (
                                                    source.url, film_id)
                                                uk_req = opener.open(
                                                    urllib2.Request(uk_url))
                                                if uk_req.getcode() == 200:
                                                    uk_data = BeautifulSoup(
                                                        uk_req.read().decode(
                                                            'utf-8'),
                                                        from_encoding="utf-8")
                                                    uk_name = uk_data.find(
                                                        'div', {
                                                            'class':
                                                            'myriadFilm'
                                                        }).text.encode(
                                                            'utf-8')
                                                    uk_text = uk_data.find(
                                                        'div',
                                                        itemprop='description')
                                                    uk_text_data = uk_text.findAll(
                                                        'p', limit=1)
                                                    if uk_text_data:
                                                        uk_text = uk_text_data[
                                                            0].text.encode(
                                                                'utf-8')
                                                    else:
                                                        uk_text = uk_text.text.encode(
                                                            'utf-8').strip()
                                                    uk_text = uk_text.replace(
                                                        'редактирование синопсиса',
                                                        '').strip()
                                                    obj = create_sfilm(
                                                        film_id,
                                                        kid,
                                                        source,
                                                        uk_name,
                                                        txt=uk_text)
                                                    films[film_id] = obj
                                                    if not fdict.get(kid):
                                                        fdict[kid] = {
                                                            'editor_rel': [],
                                                            'script_rel': []
                                                        }
                                                    fdict[kid][
                                                        'script_rel'].append(
                                                            obj)
                                            else:
                                                if film_slug.decode(
                                                        'utf-8'
                                                ) not in films_slugs:
                                                    nof_films += xml_noffilm(
                                                        film_name, film_slug,
                                                        None, None, film_id,
                                                        info, full_url,
                                                        source.id)
                                                    noffilms.append(film_id)
                                        if obj:
                                            shows = film_block.find(
                                                'div',
                                                {'class': 'filmShows'})
                                            for times in shows.findAll(
                                                    'a', {'class': 'time'}):
                                                # Some time links embed a
                                                # <sup> tag; strip and retry.
                                                try:
                                                    hours, minutes = times.text.split(
                                                        ':')
                                                except AttributeError:
                                                    times.find(
                                                        'sup').extract()
                                                    hours, minutes = times.text.split(
                                                        ':')
                                                dtime = datetime.datetime(
                                                    date['obj'].year,
                                                    date['obj'].month,
                                                    date['obj'].day,
                                                    int(hours), int(minutes))
                                                sch_id = '%s%s%s%s' % (
                                                    dtime, cinema_slug,
                                                    city_slug, film_id)
                                                sch_id = sch_id.replace(
                                                    ' ', '').decode('utf-8')
                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=obj,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)
    return nof_films, nof_cinemas, ''
def get_cinema5_schedules():
    """Import showtimes for the 'Синема 5' chain from cinema5.ru.

    The chain has one cinema per city, each under its own URL path; for each
    city it fetches today/tomorrow/+2days schedule pages, identifies films
    (the slug doubles as the source id — the site exposes no numeric id),
    and creates SourceSchedules rows.  Missing cinemas and unidentified
    films go to their respective "nof" XML dumps.  Side effects only.
    """
    data_nof_cinema = ''
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://cinema5.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    # City -> per-city sub-site URL
    data = [
        {
            'city': 'Нижнекамск',
            'url': '%snk' % source.url
        },
        {
            'city': 'Оренбург',
            'url': '%soren' % source.url
        },
        {
            'city': 'Саратов',
            'url': '%ssaratov' % source.url
        },
        {
            'city': 'Уфа',
            'url': '%sufa' % source.url
        },
        {
            'city': 'Чебоксары',
            'url': '%scheby' % source.url
        },
    ]
    # The site only serves three relative dates.
    params = ['today', 'tomorrow', '+2days']
    cinema_name = 'Синема 5'
    cinema_slug = low(del_separator(cinema_name))
    for i in data:
        city_slug = low(del_separator(i['city']))
        city = City.objects.get(name__name=i['city'], name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': i['city'],
            })
        cinema = None
        try:
            cinema = Cinema.objects.get(name__name=cinema_name,
                                        name__status=1,
                                        city=city)
        except Cinema.DoesNotExist:
            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                cinema_name, cinema_slug, i['city'], city_obj.city.kid)
        if cinema:
            # The chain name is the same everywhere, so the city is baked
            # into the source id to keep it unique.
            cinema_id = '%s_%s' % (cinema_slug, city_slug)
            cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
                source_id=cinema_id,
                source_obj=source,
                defaults={
                    'source_id': cinema_id,
                    'source_obj': source,
                    'city': city_obj,
                    'cinema': cinema,
                    'name': cinema_name,
                })
            for param in params:
                url = '%s?date=%s' % (i['url'], param)
                req = urllib.urlopen(url)
                if req.getcode() == 200:
                    page_data = BeautifulSoup(req.read())
                    divs = page_data.find('div',
                                          {'class': 'content clearfix'})
                    showdate = divs.find('h1')
                    if showdate:
                        # <h1> looks like "Расписание на DD.MM.YYYY".
                        showdate = showdate.string.encode('utf-8')
                        day, month, year = showdate.replace(
                            'Расписание на ', '').strip().split('.')
                        for div in divs.findAll('div',
                                                {'class': 'show-wrapper'}):
                            film_name = div.find('div', {
                                'class': 'title'
                            }).string.encode('utf-8')
                            film_slug = low(
                                del_separator(del_screen_type(film_name)))
                            film_id = film_slug
                            if film_id not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:
                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug, None, {}, {},
                                            source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id.decode(
                                                'utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info, None, source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        for span in div.findAll(
                                                'span', {'class': 'time'}):
                                            hours, minutes = span.string.strip(
                                            ).split(':')
                                            dtime = datetime.datetime(
                                                int(year), int(month),
                                                int(day), int(hours),
                                                int(minutes))
                                            sch_id = '%s%s%s%s' % (
                                                dtime, cinema_id, city_slug,
                                                film_id)
                                            sch_id = sch_id.replace(
                                                ' ', '').decode('utf-8')
                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id=sch_id,
                                                    source_obj=source,
                                                    film=objt,
                                                    cinema=cinema_obj,
                                                    dtime=dtime,
                                                )
                                                schedules.append(sch_id)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_kinoteatrua_films_and_persons():
    """Import Ukrainian film descriptions and person names from kino-teatr.ua.

    Phase 1: fetch each film's Ukrainian page, create/update SourceFilms
    records with the Ukrainian synopsis.  Phase 2: re-fetch the Russian
    pages, match person names between the two languages and attach
    Ukrainian NamePerson aliases to known Person objects.
    Unmatched films/persons go to NOF dump files.
    """
    opener = give_me_cookie()
    source = ImportSources.objects.get(url='http://kino-teatr.ua/')
    sfilm_clean(source)
    # Previous not-found dump is re-read so already-reported films are skipped.
    try:
        with open('%s/dump_%s_nof_film.xml' %
                  (settings.NOF_DUMP_PATH, source.dump), 'r') as f:
            xml_data = BeautifulSoup(f.read(), from_encoding="utf-8")
    except IOError:
        xml_data = BeautifulSoup('', from_encoding="utf-8")
    ignored = get_ignored_films()
    films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]
    data_nof_film = ''
    persons_dict = {}  # film id -> {person id: ukrainian name}
    data_nof_persons = ''
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    year = datetime.datetime.now().year
    lang = Language.objects.get(name='Украинский')

    def get_persons(data):
        # Collect directors and actors: {numeric person id: display name}.
        persons = {}
        tags = ['director', 'actor']
        for tag in tags:
            for p in data.findAll('span', itemprop=tag):
                person_id = p.a.get('href')
                person_id = long(re.findall(r'\d+', person_id)[0])
                if p.a.text:
                    persons[person_id] = p.a.text.encode('utf-8')
        return persons

    films_urls = get_kinoteatrua_films_links(
        'http://kino-teatr.ua/films-near.phtml', 1, year, source, opener)
    for ind, film in enumerate(films_urls):
        # Ukrainian version of the film page lives under /uk/.
        film_ua_url = film['url'].replace(source.url, '%suk/' % source.url)
        req_text = opener.open(urllib2.Request(film_ua_url))
        if req_text.getcode() == 200:
            film_data = BeautifulSoup(req_text.read(), from_encoding="utf-8")
            persons = get_persons(film_data)
            persons_dict[film['id']] = persons
            name = film_data.find('div', {
                'class': 'myriadFilm'
            }).text.encode('utf-8')
            name = name.replace('Фільм ', '').strip()
            text = film_data.find('div', itemprop='description')
            text_data = text.findAll('p', limit=1)
            if text_data:
                text = text_data[0].text.encode('utf-8')
            else:
                text = text.text.encode('utf-8').strip()
            text = text.replace('редактирование синопсиса', '').strip()
            # Placeholder synopses ("project announced"/"pre-production") are dropped.
            if text in ('Проект оголошений', 'Підготовка до зйомок'):
                text = ''
            film_slug = low(del_separator(film['name']))
            temp_film_slug = film_slug.decode('utf-8')
            if temp_film_slug not in ignored and temp_film_slug not in films_slugs:
                obj = films.get(film['id'])
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(film_slug, None, {}, {},
                                                        year, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film['id'], kid, source, name,
                                                year=film.get('year'), txt=text)
                            films[film['id']] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {
                                    'editor_rel': [],
                                    'script_rel': []
                                }
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        if temp_film_slug not in films_slugs:
                            data_nof_film += xml_noffilm(
                                film['name'], film_slug, None, None, film['id'],
                                info, film['url'].encode('utf-8'), source.id)
                    # Refresh the synopsis of an already-existing record.
                    if objt and not create_new:
                        try:
                            film_text = objt.text.encode('utf-8')
                        except UnicodeDecodeError:
                            film_text = objt.text
                        if film_text != text:
                            objt.text = text
                            objt.save()
        # Polite throttling on every second request.
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    # Merge fresh not-found entries with the previously dumped ones.
    xml_data = str(xml_data).replace('<html><head></head><body><data>',
                                     '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_film)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     xml_data)
    cron_success('html', source.dump, 'films', 'Укр. фильмы')
    # persons
    persons_nof_list = []
    persons_list = []  # person ids already processed in this run
    for ind, film in enumerate(films_urls):
        # Russian page gives the Russian spelling used for DB lookup.
        req = opener.open(urllib2.Request(film['url']))
        if req.getcode() == 200:
            film_data = BeautifulSoup(req.read(), from_encoding="utf-8")
            persons = get_persons(film_data)
            for person_id, person_ru_name in persons.iteritems():
                if person_id not in persons_nof_list and person_id not in persons_list:
                    ukr_person = persons_dict.get(film['id'])
                    if ukr_person:
                        ukr_person_name = ukr_person.get(person_id)
                        if ukr_person_name:
                            ukr_person_name_slug = low(
                                del_separator(ukr_person_name))
                            person_ru_name_slug = low(
                                del_separator(person_ru_name))
                            person_obj = Person.objects.filter(
                                name__name=person_ru_name_slug).exclude(
                                    kid=None)
                            # Attach aliases only on an unambiguous match.
                            if person_obj.count() == 1:
                                # status 1 = display name, status 2 = slug form.
                                names = [{
                                    'name': ukr_person_name,
                                    'status': 1
                                }, {
                                    'name': ukr_person_name_slug,
                                    'status': 2
                                }]
                                for i in names:
                                    name_obj, name_created = NamePerson.objects.get_or_create(
                                        name=i['name'],
                                        status=i['status'],
                                        language=lang,
                                        defaults={
                                            'name': i['name'],
                                            'status': i['status'],
                                            'language': lang,
                                        })
                                    if name_obj not in person_obj[0].name.all():
                                        person_obj[0].name.add(name_obj)
                            else:
                                data_nof_persons += '<person name="%s" slug="%s" code="%s" name_alt="%s" slug_alt="%s"></person>' % (
                                    person_ru_name.replace('"', "'"),
                                    person_ru_name_slug, person_id,
                                    ukr_person_name.replace('"', "'"),
                                    ukr_person_name_slug)
                            persons_list.append(person_id)
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    create_dump_file('%s_nof_person' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_persons)
    cron_success('html', source.dump, 'persons', 'Укр. персоны')
def get_kinomagnat_schedules():
    """Import showtimes for the 'Магнат' cinema (Kyiv) from kinomagnat.com.ua.

    Parses the mobile schedule page, resolves halls against the reference
    DB, identifies films and creates SourceSchedules rows.  Unmatched
    cinema/halls/films are reported via NOF dump files.
    """
    ignored = get_ignored_films()
    data_nof_film = ''
    data_nof_hall = ''
    data_nof_cinema = ''
    noffilms = []
    nofhalls = []
    city_name = 'Киев'
    cinema_name = 'Магнат'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://www.kinomagnat.com.ua/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    halls = get_source_data(source, 'hall', 'dict')
    city = City.objects.get(name__name=city_name, name__status=1)
    try:
        cinema = Cinema.objects.get(name__name=cinema_name,
                                    name__status=1,
                                    city=city)
    except Cinema.DoesNotExist:
        cinema = None
        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
            cinema_name, cinema_slug, city_name, city.kid)
    if cinema:
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })
        cinema_kid = cinema.code
        city_kid = city.kid
        today = datetime.date.today()
        # Mobile rendering of the schedule page is simpler to parse.
        url = '%sseans.html?device=iphone' % source.url
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            div = data.find('div', {'class': 'contentpaneopen'})
            for table in div.findAll('table'):
                # The date lives in a preceding <p>; its distance from the
                # table varies, so probe progressively further back.
                try:
                    day, month = table.find_all_previous(
                        "p", limit=2)[1].text.strip().split()
                except ValueError:
                    try:
                        day, month = table.find_all_previous(
                            "p", limit=3)[2].text.strip().split()
                    except ValueError:
                        day, month = table.find_all_previous(
                            "p", limit=4)[3].text.strip().split()
                month = get_month_ua(low(month.encode('utf-8')))
                date_sch = datetime.date(today.year, month, int(day))
                hall_name = table.findAll(
                    'tr', limit=1)[0].text.strip().encode('utf-8')
                hall_name_slug = low(del_separator(hall_name))
                if hall_name_slug not in nofhalls:
                    hall_obj = halls.get(hall_name_slug)
                    if not hall_obj:
                        halls_obj = Hall.objects.filter(
                            name__name=hall_name_slug,
                            cinema=cinema_obj.cinema).distinct('pk')
                        # Create a source hall only on an unambiguous match.
                        if halls_obj.count() == 1:
                            hall_kid = halls_obj[0].kid
                            hall_obj = SourceHalls.objects.create(
                                source_id=hall_name_slug,
                                source_obj=source,
                                cinema=cinema_obj,
                                name=hall_name,
                                kid=hall_kid,
                            )
                            halls[hall_name_slug] = hall_obj
                        else:
                            id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name,
                                               hall_name_slug)
                            id = id.replace(' ', '')
                            data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (
                                city_name, city_kid, cinema_name, cinema_kid,
                                hall_name, hall_name_slug, id)
                            nofhalls.append(hall_name_slug)
                    if hall_obj:
                        for ind, tr in enumerate(table.findAll('tr')):
                            # Row 0 is the hall-name header.
                            if ind != 0:
                                showtime, film_data = tr.findAll('td', limit=2)
                                hour, minute = showtime.text.strip().encode(
                                    'utf-8').split(':')
                                dtime = datetime.datetime(
                                    date_sch.year, date_sch.month,
                                    date_sch.day, int(hour), int(minute))
                                a = film_data.find('a')
                                film_id = a.get('href').encode('utf-8')
                                full_url = '%s%s' % (source.url,
                                                     film_id.lstrip('/'))
                                film_name = a.text.strip().encode('utf-8')
                                film_slug = low(del_separator(film_name))
                                if film_id not in noffilms and film_slug.decode(
                                        'utf-8') not in ignored:
                                    obj = films.get(film_id)
                                    next_step = checking_obj(obj)
                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(
                                                film_slug, None, {}, {},
                                                source=source)
                                        objt = None
                                        if kid:
                                            create_new, objt = unique_func(
                                                fdict, kid, obj)
                                            if create_new:
                                                # FIX: bind the created film to objt
                                                # (was a dead local `new`), so the
                                                # `if objt:` branch below also runs
                                                # for newly-created films — matching
                                                # every sibling schedule importer.
                                                objt = create_sfilm(
                                                    film_id, kid, source,
                                                    film_name)
                                                films[film_id] = objt
                                                if not fdict.get(kid):
                                                    fdict[kid] = {
                                                        'editor_rel': [],
                                                        'script_rel': []
                                                    }
                                                fdict[kid]['script_rel'].append(
                                                    objt)
                                        elif not obj:
                                            data_nof_film += xml_noffilm(
                                                film_name, film_slug, None,
                                                None, film_id, info,
                                                full_url.encode('utf-8'),
                                                source.id)
                                            noffilms.append(film_id)
                                        if objt:
                                            sch_id = '%s%s%s' % (
                                                dtime, hall_obj.id, film_id)
                                            sch_id = sch_id.replace(
                                                ' ', '').decode('utf-8')
                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id=sch_id,
                                                    source_obj=source,
                                                    film=objt,
                                                    cinema=cinema_obj,
                                                    hall=hall_obj.kid,
                                                    dtime=dtime,
                                                )
                                                schedules.append(sch_id)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_hall)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_rambler_films():
    """Import films from the Rambler Kassa XML API.

    Reads the locally dumped index XML to get per-file export names, then
    fetches each Movie export and creates SourceFilms records for
    identified films; unidentified ones go to the NOF dump.
    """
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    noffilms = []
    data_nof_films = ''
    # Dead alternative code path kept by the original author for local runs;
    # the doubled `if xml:` mimics the server branch's indentation depth.
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_films.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
        if xml:
    # --- end localhost
    '''
    # SERVER
    f = open('%s/dump_rambler_index.xml' % settings.API_DUMP_PATH, 'r')
    xml_index = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    creations = xml_index.find('creations')
    filenames = []
    for i in creations.findAll('file'):
        filename = i.get('filename')
        if filename:
            filenames.append(filename)
    for i in filenames:
        url = 'http://api.kassa.rambler.ru/v2/%s/xml/Movie/export/sale/%s' % (
            RAMBLER_API_KEY, i)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            xml = BeautifulSoup(req.read(), from_encoding="utf-8")
            # --- end server
            # NOTE: inner loop deliberately reuses the name `i`.
            for i in xml.findAll('creation'):
                film_id = i.objectid.string
                if film_id not in noffilms:
                    try:
                        year = int(i.year.string) if i.year.string else None
                    except UnicodeEncodeError:
                        year = None
                    full_url = 'https://kassa.rambler.ru/movie/%s' % film_id
                    name = i.find('name').string.encode('utf-8')
                    name_slug = low(del_separator(name))
                    if year and name_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                try:
                                    kid, info = film_identification(
                                        name_slug, None, {}, {},
                                        year=year, source=source)
                                except db.backend.Database._mysql.OperationalError:
                                    # DB hiccup — skip this film, keep importing.
                                    next_step = False
                            if next_step:
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid,
                                                           source, name,
                                                           year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(new)
                                elif not obj:
                                    data_nof_films += xml_noffilm(
                                        name, name_slug, None, None,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
    # NOTE(review): replace('&', '&') is a no-op — looks like a garbled
    # '&amp;' XML escape; confirm against version history.
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films.replace('&', '&'))
    cron_success('xml', source.dump, 'films', 'Фильмы')
def get_megamag():
    """Import cities, cinemas and showtimes from megamag.by (kinoteatr.megamag.by).

    Phase 1: parse the index page for cities and cinemas, collecting
    per-cinema schedule links and an event-id -> film mapping.
    Phase 2: open every schedule link, identify the film and create a
    SourceSchedules row.  Unmatched cities/cinemas/films are reported
    via NOF dump files.
    """
    import cookielib

    def give_me_cookie():
        # The site requires a cookie session; a fresh jar per request set.
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      urllib2.HTTPHandler())
        return opener

    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()
    source = ImportSources.objects.get(url='http://megamag.by/')
    sfilm_clean(source)
    megamag_cities_dict = get_source_data(source, 'city', 'dict')
    megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    cities_data = {}  # city display name -> SourceCities
    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []
    schedules_data = []  # one entry per cinema: links to its schedule pages
    opener = give_me_cookie()
    req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php'))
    event_dict = {}  # event id -> {'name': film name, 'id': film id}
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        cities = data.find('div', id="box-region")
        for i in cities.findAll('a'):
            city_name = i.text.encode('utf-8')
            city_slug = low(del_separator(city_name))
            city_id = i.get('href').replace(
                'http://kinoteatr.megamag.by/index.php?region_id=', '')
            mcity = megamag_cities_dict.get(city_id)
            if not mcity:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                # Create the source city only on an unambiguous match.
                if city.count() == 1:
                    mcity = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)
            if mcity:
                cities_data[city_name] = mcity
        try:
            cinemas_tag = data.findAll('td',
                                       {'class': 'Cinema_new_box_1_BoxText'},
                                       limit=1)[0]
        except IndexError:
            cinemas_tag = None
        if cinemas_tag:
            for i in cinemas_tag.findAll('a'):
                cinema_url = i.get('href')
                cinema_id = cinema_url.replace(
                    'http://kinoteatr.megamag.by/index.php?cPath=', '')
                cinema_obj = megamag_cinemas_dict.get(cinema_id)
                opener = give_me_cookie()
                try:
                    req2 = opener.open(urllib2.Request(cinema_url))
                    if req2.getcode() == 200:
                        schedules_page = BeautifulSoup(req2.read(),
                                                       from_encoding="utf-8")
                        city_name = schedules_page.findAll(
                            'div', {'class': 'object_param_value'},
                            limit=1)[0].text.encode('utf-8')
                        city_obj = cities_data.get(city_name)
                        if city_obj:
                            cinema_name = schedules_page.find(
                                'div', {
                                    'class': 'object_title'
                                }).text.encode('utf-8')
                            cinema_name = cinema_name.replace('"', '').replace(
                                'Кинотеатр', '')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_ig_id = u'%s__%s' % (
                                cinema_slug.decode('utf-8'),
                                city_obj.city.kid)
                            if cinema_ig_id not in ignored_cinemas:
                                if not cinema_obj:
                                    filter1 = {
                                        'name__name': cinema_slug,
                                        'name__status': 2,
                                        'city': city_obj.city
                                    }
                                    cinema_kid = cinema_identification(
                                        cinema_slug, filter1)
                                    if cinema_kid:
                                        try:
                                            cinema = Cinema.objects.get(
                                                code=cinema_kid)
                                            cinema_obj = SourceCinemas.objects.create(
                                                source_id=cinema_id,
                                                source_obj=source,
                                                city=city_obj,
                                                cinema=cinema,
                                                name=cinema_name,
                                            )
                                        except Cinema.DoesNotExist:
                                            pass
                                else:
                                    cinema_kid = cinema_obj.cinema.code
                                if cinema_kid:
                                    # Map event anchors to film names/ids for phase 2.
                                    for event in schedules_page.findAll(
                                            'td', {'class': 'eventsHeading'}):
                                        if event.a.get('name'):
                                            ev = event.a['name'].split('_')[1]
                                            fname = event.a.text.encode(
                                                'utf-8')
                                            fid = event.a.get('href').replace(
                                                'http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=',
                                                '')
                                            event_dict[int(ev)] = {
                                                'name': fname,
                                                'id': int(fid)
                                            }
                                    links = []
                                    for td in schedules_page.findAll(
                                            'td', {'class': 'main'}):
                                        for link in td.findAll('a'):
                                            l = link.get('href')
                                            if l and 'cPath' in l:
                                                links.append(l)
                                    schedules_data.append({
                                        'mcity': city_obj,
                                        'city': city_obj.city,
                                        'mcinema': cinema_obj,
                                        'cinema': cinema_kid,
                                        'schedules': set(links)
                                    })
                                else:
                                    if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                            cinema_name, cinema_slug,
                                            city_name, city_obj.city.kid)
                except httplib.HTTPException:
                    # Best-effort: skip a cinema whose page failed to load.
                    pass
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    megamag = get_source_data(source, 'schedule', 'list')
    for obj in schedules_data:
        cinema_object = obj['mcinema']
        for index, i in enumerate(obj['schedules']):
            opener = give_me_cookie()
            try:
                req3 = opener.open(urllib2.Request(i))
                if req3.getcode() == 200:
                    id_schedule = i.replace(
                        'http://kinoteatr.megamag.by/index.php?cPath=',
                        '').encode('utf-8')
                    if id_schedule not in megamag:
                        sch_page = BeautifulSoup(req3.read(),
                                                 from_encoding="utf-8")
                        tables = sch_page.findAll(
                            'table',
                            {'class': 'Cinema_new_box_2_TemplateCenterPart'},
                            limit=1)[0]
                        main_table = tables.findAll('table',
                                                    cellpadding='4',
                                                    limit=1)[0]
                        tr = main_table.findAll('tr')[1]
                        td = tr.findAll('strong')
                        # The schedule id embeds the event id as its 3rd segment.
                        event_id = id_schedule.split('_')[2]
                        film_data = event_dict.get(int(event_id))
                        if film_data:
                            film_name = film_data['name']
                            film_name_slug = low(
                                del_separator(del_screen_type(film_name)))
                            film_id = film_data['id']
                            if film_id not in noffilms and film_name_slug.decode(
                                    'utf-8') not in ignored:
                                obj = films.get(str(film_id).decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_name_slug, None, {}, {},
                                            source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[str(film_id).decode(
                                                'utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid][
                                                'script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(
                                            film_name, film_name_slug, None,
                                            None, film_id, info, None,
                                            source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        # Cell text like 'DD month, ... (HH:MM)'.
                                        dtime_info = td[1].text.encode(
                                            'utf-8').split()
                                        year_info = datetime.datetime.now(
                                        ).year
                                        day_info = int(dtime_info[0])
                                        month_low = low(
                                            dtime_info[1].replace(',', ''))
                                        month_info = int(
                                            get_month(month_low))
                                        time_info = dtime_info[-1].replace(
                                            '(', '').replace(')',
                                                             '').split(':')
                                        dtime = datetime.datetime(
                                            year_info, month_info, day_info,
                                            int(time_info[0]),
                                            int(time_info[1]), 0)
                                        SourceSchedules.objects.create(
                                            source_id=id_schedule,
                                            source_obj=source,
                                            cinema=cinema_object,
                                            film=objt,
                                            dtime=dtime,
                                        )
            except httplib.HTTPException:
                open('%s/httplib_errors.txt' % settings.API_DUMP_PATH,
                     'a').write('%s\n' % i)
            # Pause 2 seconds after every 60th request to the source.
            if (index + 1) % 60 == 0:
                time.sleep(2.0)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def tvzavr_ident():
    """Identify tvzavr.ru player films from the locally dumped sitemap XML.

    Reads dump_<source>_index.xml, matches each <url> entry against the
    film reference DB and creates SourceFilms (with the player URL in
    `extra`); unmatched titles go to the NOF dump.
    """
    source = ImportSources.objects.get(url='http://www.tvzavr.ru/')
    sfilm_clean(source)
    path = '%s/dump_%s_index.xml' % (settings.API_DUMP_PATH, source.dump)
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    with open(path, 'r') as f:
        data = BeautifulSoup(f.read(), "html.parser")
    for i in data.findAll('url'):
        title = i.find('video:video').find('video:title').text.encode('utf-8')
        slug = low(del_separator(title))
        film_id = i.find('tvzavr:video').find('tvzavr:id').text
        # Skip series episodes ('серия' in the slug) and known misses.
        if not 'серия' in slug and film_id not in noffilms:
            if slug.decode('utf-8') not in ignored:
                url = i.find('loc').text.encode('utf-8')
                year = i.find('tvzavr:video').find('tvzavr:year').text
                obj = films.get(film_id)
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(slug, None, {}, {},
                                                        year=year,
                                                        source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            # Player URL is stored in `extra`.
                            new = create_sfilm(film_id, kid, source, title,
                                               year=year, extra=url)
                            films[film_id] = new
                            if not fdict.get(kid):
                                fdict[kid] = {
                                    'editor_rel': [],
                                    'script_rel': []
                                }
                            fdict[kid]['script_rel'].append(new)
                    elif not obj:
                        data_nof_film += xml_noffilm(title, slug, None, None,
                                                     film_id.encode('utf-8'),
                                                     info, url, source.id)
                        noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
def get_kinohod_films():
    """Import films from the Kinohod partner JSON API, per known city.

    Creates SourceFilms for identified titles, dumps unidentified ones to
    the NOF file, and appends a human-readable run report to the cron log.
    """
    # print "BEGIN get_kinohod_films()"
    ignored = get_ignored_films()
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data_new = ''  # cron-log fragment: newly created films
    cron_data_nof = ''  # cron-log fragment: unidentified films
    cron_count = 0
    noffilms = []
    source = ImportSources.objects.get(url='http://kinohod.ru/')
    sfilm_clean(source)
    kinohod_cities = get_source_data(source, 'city', 'list')
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    data_nof_films = ''
    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s' % SERVER_API_KEY
    for city_id in kinohod_cities:
        try:
            url = '%s&city=%s' % (main_url, city_id)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    film_id = str(i['id']).decode('utf-8')
                    year = int(
                        i['productionYear']) if i['productionYear'] else None
                    name_ru = i['title'].encode('utf-8')
                    name_ru_slug = low(del_separator(del_screen_type(name_ru)))
                    full_url = '%smovie/%s/' % (source.url, film_id)
                    name_en = None
                    name_en_slug = None
                    if i['originalTitle']:
                        name_en = i['originalTitle'].encode('utf-8')
                        name_en_slug = low(
                            del_separator(del_screen_type(name_en)))
                    if year and name_ru_slug.decode(
                            'utf-8'
                    ) not in ignored and film_id not in noffilms:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            try:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        name_ru_slug, name_en_slug, {}, {},
                                        year=year, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid,
                                                           source, name_ru,
                                                           name_alt=name_en,
                                                           year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(new)
                                        cron_data_new += '%s<br />' % name_ru
                                elif not obj:
                                    # NOF dump format requires non-empty alt names.
                                    if not name_en:
                                        name_en = '*'
                                        name_en_slug = '*'
                                    data_nof_films += xml_noffilm(
                                        name_ru, name_ru_slug, name_en,
                                        name_en_slug,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                    cron_data_nof += '%s<br />' % name_ru
                            except db.backend.Database._mysql.OperationalError:
                                # DB hiccup — skip this film, keep going.
                                pass
        except IOError:
            # Network failure for this city — log the URL and continue.
            open('%s/ddd.txt' % settings.API_DUMP_PATH,
                 'a').write(str(url) + '\n')
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(),
                                       start_time, end_time,
                                       'Импорт фильмов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    # Visual separator for the log file.
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_films.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data)
    cron_success('json', source.dump, 'films', 'Фильмы')
def get_rottentomatoes_films(everyday=True):
    """Import opening films and critic ratings from rottentomatoes.com.

    First refreshes critic stats (`extra`) for films already imported
    (optionally restricted to a recent release window when `everyday`),
    then pulls the 'opening' list from the private JSON API and creates
    SourceFilms for newly identified titles.

    :param everyday: when True, only films whose stored release date falls
        in the [-30 days, +7 days] window are refreshed.
    """
    def get_critic(block):
        # Parse the '#scoreStats' block into 'average;reviews;fresh;rotten'.
        critic = block.findAll('div', id="scoreStats", limit=1)
        if critic:
            critic = critic[0].findAll('div')
            average = critic[0].find('span', {
                'class': 'subtle superPageFontColor'
            }).next_sibling.string.strip()
            reviews = critic[1].findAll('span', limit=2)[1].text.strip()
            fresh = critic[2].find('span', {
                'class': 'subtle superPageFontColor'
            }).next_sibling.string.strip()
            rotten = critic[3].find('span', {
                'class': 'subtle superPageFontColor'
            }).next_sibling.string.strip()
            return '%s;%s;%s;%s' % (average.replace('/10', ''), reviews,
                                    fresh, rotten)
        else:
            return 'N/A;0;0;0'
        # Dead legacy parser kept by the original author (unreachable).
        '''
        critic = block.findAll('p', {'class': 'critic_stats'}, limit=1)[0]
        average, reviews = critic.findAll('span', limit=2)
        try:
            fresh, rotten = reviews.next_sibling.next_sibling.encode('utf-8').strip().split(' | ')
        except AttributeError:
            return 'N/A;0;0;0'
        fresh = fresh.replace('Fresh:','').strip()
        rotten = rotten.replace('Rotten:','').strip()
        average = average.string.encode('utf-8').split('/')[0]
        reviews = reviews.string.encode('utf-8')
        return '%s;%s;%s;%s' % (average, reviews, fresh, rotten)
        '''

    source = ImportSources.objects.get(url='http://www.rottentomatoes.com/')
    sfilm_clean(source)
    noffilms = []
    data_nof_film = ''
    filter = {'source_obj': source}
    if everyday:
        # Restrict refresh to films released within [-30 days, +7 days];
        # the release date is stored in the `text` field.
        today = datetime.datetime.today().date()
        day7 = today + datetime.timedelta(days=7)
        today = today - datetime.timedelta(days=30)
        filter['text__gte'] = today
        filter['text__lt'] = day7
    exists = get_source_data(source, 'film', 'list')
    films = {}
    source_films = SourceFilms.objects.filter(**filter)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    ignored = get_ignored_films()
    opener = urllib2.build_opener()
    # The site rejects the default urllib2 user agent.
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',
    }
    opener.addheaders = headers.items()
    updated = []
    # Pass 1: refresh critic stats on already-imported films.
    for k, f in films.items():
        film_url = '%s%s' % (source.url, k)
        req = opener.open(film_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            extra = get_critic(data)
            f.extra = extra
            f.save()
            updated.append(k)
        time.sleep(1)
    # Pass 2: discover films from the 'opening' list API.
    u = 'http://www.rottentomatoes.com/api/private/v1.0/m/list/find?page=1&limit=50&type=opening&minTomato=0&maxTomato=100&minPopcorn=0&maxPopcorn=100&services=&genres=1%3B2%3B4%3B5%3B6%3B8%3B9%3B10%3B11%3B13%3B14%3B18&sortBy=popularity&certified=false'
    req = opener.open(u)
    if req.getcode() == 200:
        data = json.loads(req.read(), encoding="latin-1")
        for i in data['results']:
            title = i['title'].encode('utf-8')
            title_slug = low(del_separator(title))
            url = i['url'].lstrip('/')
            full_url = '%s%s' % (source.url, url)
            if url not in exists and url not in noffilms:
                if title_slug.decode(
                        'utf-8') not in ignored and url not in updated:
                    time.sleep(1)
                    req2 = opener.open(full_url)
                    if req2.getcode() == 200:
                        data2 = BeautifulSoup(req2.read(),
                                              from_encoding="utf-8")
                        # Site has two title markups; try both.
                        year_block = data2.find('h1',
                                                {'class': 'title hidden-xs'})
                        if not year_block:
                            year_block = data2.find('h1', id='movie-title')
                        year_tmp = year_block.find('span', {
                            'class': 'h3 year'
                        }).text.encode('utf-8')
                        year = int(
                            year_tmp.replace('(', '').replace(')', ''))
                        release_date = data2.find('td',
                                                  itemprop="datePublished")
                        if release_date:
                            release_date = release_date.get('content')
                        extra = get_critic(data2)
                        obj = films.get(url)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                                obj.extra = extra
                                obj.save()
                            else:
                                kid, info = film_identification(None,
                                                                title_slug,
                                                                {}, {}, year,
                                                                source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(
                                    fdict, kid, obj)
                                if create_new:
                                    new = create_sfilm(url, kid, source,
                                                       title,
                                                       txt=release_date,
                                                       extra=extra)
                                    films[url] = new
                                    if not fdict.get(kid):
                                        fdict[kid] = {
                                            'editor_rel': [],
                                            'script_rel': []
                                        }
                                    fdict[kid]['script_rel'].append(new)
                            elif not obj:
                                data_nof_film += xml_noffilm(
                                    title, title_slug, None, None,
                                    url.encode('utf-8'), info,
                                    full_url.encode('utf-8'), source.id)
                                noffilms.append(url)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы, рейтинг')
def get_yovideo():
    """Import French theatrical release dates from yo-video.net.

    Walks the monthly release calendars from the current month to December,
    opens each release day, and creates SourceFilms with the release date
    in `extra`.  For unidentified films, page details (poster URL,
    director, year) are packed into `txt` and the record is still created.
    """
    source = ImportSources.objects.get(url='http://www.yo-video.net/')
    sfilm_clean(source)
    today = datetime.datetime.now()
    # Month names as used in the site's URL scheme (no diacritics).
    french_month = {
        '1': 'janvier',
        '2': 'fevrier',
        '3': 'mars',
        '4': 'avril',
        '5': 'mai',
        '6': 'juin',
        '7': 'juillet',
        '8': 'aout',
        '9': 'septembre',
        '10': 'octobre',
        '11': 'novembre',
        '12': 'decembre',
    }
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    main_urls = []
    # From the current month to the end of the year.
    for i in range(today.month, 13):
        m = french_month.get(str(i))
        url = '%sfr/sorties/cinema/%s/%s/' % (source.url, today.year, m)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            # Each <h2> is a release day within the month.
            for h2 in data.findAll('h2'):
                day = h2.findAll('span', limit=1)[0].string.encode('utf-8')
                time.sleep(1)
                req2 = urllib.urlopen('%s%s' % (url, day))
                if req2.getcode() == 200:
                    data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                    release_date = datetime.date(today.year, int(i), int(day))
                    for film_block in data2.findAll('div',
                                                    {'class': 'sfilm'}):
                        film_id = film_block.find('a').get('href').encode(
                            'utf-8')
                        full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                        name = film_block.find('img').get('alt').encode(
                            'utf-8').replace('Film ', '')
                        slug = low(del_separator(name))
                        if slug.decode(
                                'utf-8'
                        ) not in ignored and film_id not in noffilms:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                kid = None
                                if obj:
                                    kid = obj.kid
                                if not kid:
                                    # Open the film page for the alt title
                                    # and identification details.
                                    req3 = urllib.urlopen(full_url)
                                    if req3.getcode() == 200:
                                        data3 = BeautifulSoup(
                                            req3.read(),
                                            from_encoding="utf-8")
                                        h3 = data3.find('h3')
                                        alter_name = None
                                        alter_name_slug = None
                                        if h3:
                                            alter_name = h3.string.encode(
                                                'utf-8')
                                            alter_name_slug = low(
                                                del_separator(alter_name))
                                        kid, info = film_identification(
                                            slug, alter_name_slug, {}, {},
                                            source=source)
                                        txt = None
                                        if not kid:
                                            # Unidentified: pack page details
                                            # into txt for manual review.
                                            div = data3.find(
                                                'div', {'class': "filmLeft"})
                                            img_url = div.find('img').get(
                                                'src').encode('utf-8')
                                            details = data3.find(
                                                'div', {'class': "details"})
                                            director = details.find(
                                                'span', itemprop="name")
                                            if director:
                                                director = director.string.encode(
                                                    'utf-8').strip()
                                            year = re.findall(
                                                ur'Année\s?\: \d+',
                                                details.text)
                                            if year:
                                                year = year[0].encode(
                                                    'utf-8').replace(
                                                        'Année', '').replace(
                                                            ':', '').strip()
                                            txt = '%s;%s;%s;%s' % (
                                                full_url.encode('utf-8'),
                                                img_url, director, year)
                                            kid = None
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(
                                            film_id, kid, source, name,
                                            name_alt=alter_name, txt=txt,
                                            extra=release_date)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(new)
                                else:
                                    # No kid: still record the film with its
                                    # collected details for later matching.
                                    if not obj:
                                        new = create_sfilm(
                                            film_id, kid, source, name,
                                            name_alt=alter_name, txt=txt,
                                            extra=release_date)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(new)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'releases', 'Франц.релизы')
def get_cinemaarthall_schedules():
    """Scrape showtimes from cinemaarthall.ru ('Синема-АРТ-Холл', Norilsk)
    and store new SourceSchedules rows.

    Flow: ensure the SourceCities/SourceCinemas rows exist, read the list of
    show dates from the films page, then for every date parse each film
    block and its session times.  Unidentified films are collected into the
    "<dump>_nof_film" XML dump.  Python 2 code: byte strings throughout.
    """
    ignored = get_ignored_films()
    data_nof_film = ''  # accumulated XML for films that could not be identified
    noffilms = []       # source ids already reported as "not found" this run
    city_name = 'Норильск'
    cinema_name = 'Синема-АРТ-Холл'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://cinemaarthall.ru/')
    sfilm_clean(source)
    # Already-imported films of this source, keyed by source_id.
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })
    dates = []
    url = '%spage/kino/films/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        # The 'datachek' block links to one films page per show date
        # ("/page/kino/films/&date=DD.MM.YYYY").
        show_days = data.find('div', id='datachek')
        for a in show_days.findAll('a'):
            day = a.get('href').replace('/page/kino/films/&date=', '')
            dates.append(day)
    for d in dates:
        url = '%spage/kino/films/&date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            for div in data.findAll('div', {'class': 'media-block'}):
                film_name = div.find('h3')
                if film_name:
                    film_name = film_name.string.encode('utf-8')
                    # The film page path (first link, slashes stripped)
                    # doubles as the film's source id.
                    film_id = div.findAll('a', limit=1)[0].get('href').replace(
                        '/', '').encode('utf-8')
                    film_slug = del_screen_type(low(del_separator(film_name)))
                    full_url = '%spage/kino/films/%s' % (source.url, film_id)
                    if film_id not in noffilms and film_slug.decode(
                            'utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(
                                        film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {
                                            'editor_rel': [],
                                            'script_rel': []
                                        }
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                # Could not identify the film: record it in the
                                # dump and skip this id for the rest of the run.
                                data_nof_film += xml_noffilm(
                                    film_name, film_slug, None, None, film_id,
                                    info, full_url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                            if objt:
                                # Session times live in span tags inside the
                                # 'filmr' block; the time is the part of the
                                # text before the first comma.
                                div_sess = div.find('div', {'class': 'filmr'})
                                for t in div_sess.findAll('span'):
                                    if t.string:
                                        t = t.string.split(',')[0]
                                        hours, minutes = t.split(':')
                                        day, month, year = d.split('.')
                                        dtime = datetime.datetime(
                                            int(year), int(month), int(day),
                                            int(hours), int(minutes))
                                        # De-dup key: datetime + cinema + city + film.
                                        sch_id = '%s%s%s%s' % (
                                            dtime, cinema_slug, city_slug, film_id)
                                        sch_id = sch_id.replace(
                                            ' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_zapad24ru():
    """Scrape the zapad24.ru affiche page and store cinema showtimes.

    The page carries one table per cinema; the cinema/city names are parsed
    from the table's <strong> caption.  The first table (ind == 0) lists
    today's films with plain "HH:MM" times; the remaining tables carry a
    show-date range per film whose times are read from the film's own page.

    Side effects: creates SourceCities/SourceCinemas/SourceFilms/
    SourceSchedules rows, writes nof_city/nof_cinema/nof_film dump files and
    reports to the cron monitor.  Python 2 code: byte strings throughout.
    """
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()
    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)
    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')
    # Already-imported films of this source, keyed by source_id.
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    today = datetime.datetime.now()
    # Ignore date ranges ending more than ~40 days out.
    next_month = datetime.date.today() + datetime.timedelta(days=40)
    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []
    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  # , from_encoding="utf-8"
        div = data.find('div', align="left")
        for ind, table in enumerate(
                div.findAll('table',
                            border="0",
                            cellpadding="0",
                            cellspacing="0",
                            width="100%")):
            # Caption looks like: Кинотеатр "Name" (г. City)
            cinema_tag = table.find('strong').string.encode('utf-8')
            cinema_name = re.findall(r'\".+\"', cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')
            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace(
                '(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')
            city_obj = cities_dict.get(city_id)
            if not city_obj:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    # Ambiguous or unknown city: report once per slug.
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)
            if city_obj:
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                            city_obj.city.kid)
                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city': city_obj.city
                        }
                        cinema_kid = cinema_identification(cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    cinema_name, cinema_slug, city_name, city_obj.city.kid)
                    if cinema_obj:
                        film_table = table.find('table')
                        date_from = None
                        date_to = None
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                # First table: film name only, shown today.
                                film_name = tr.find('b').string.encode('utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                # Other tables: a <span> holds the date range,
                                # the other <b> holds the «quoted» film name.
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find('span').string.encode('utf-8').strip()
                                    else:
                                        film_name = f.string.encode('utf-8').strip()
                                        film_name = re.findall(r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace('«', '').replace('»', '').strip()
                                        film_slug = low(del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')
                                if showdate and film_name:
                                    # Either "DD.MM-DD.MM" or "DD month – DD month".
                                    try:
                                        date_from, date_to = showdate.split('-')
                                        date_from_day, date_from_month = date_from.strip().split('.')
                                        date_to_day, date_to_month = date_to.strip().split('.')
                                    except ValueError:
                                        date_from, date_to = showdate.split(' – ')
                                        date_from_day, date_from_month = date_from.strip().split()
                                        date_from_month = get_month(date_from_month)
                                        date_to_day, date_to_month = date_to.strip().split()
                                        date_to_month = get_month(date_to_month)
                                    date_from = datetime.date(
                                        today.year, int(date_from_month), int(date_from_day))
                                    date_to = datetime.date(
                                        today.year, int(date_to_month), int(date_to_day))
                            full_url = tr.find('a').get('href').encode('utf-8')
                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug, None, {}, {}, source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        # BUGFIX: was "data_nof_film +=" — an
                                        # undefined name (the accumulator is
                                        # data_nof_films), raising NameError on
                                        # the first unidentified film.
                                        data_nof_films += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id.encode('utf-8'), info,
                                            full_url, source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(req_film.read())  # , from_encoding="utf-8"
                                            td = data_film.find('td', {
                                                'class': 'news'
                                            }).div.text.encode('utf-8')
                                            showtime = []
                                            if ind == 0:
                                                showtime = re.findall(r'\d+\:\d+\s\s?', td)
                                            else:
                                                if date_from and date_to:
                                                    if date_to < next_month:
                                                        # "Начало сеансов: HH-MM, HH-MM ..."
                                                        showtimes = re.findall(
                                                            r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td)
                                                        times = []
                                                        for t in showtimes:
                                                            t = t.replace('Начало сеансов:', '').split(',')
                                                            times = [i.strip() for i in t if i.strip()]
                                                        # Expand the date range into one
                                                        # datetime per day per time.
                                                        delta = date_to - date_from
                                                        for day in range(delta.days + 1):
                                                            d = date_from + datetime.timedelta(days=day)
                                                            for t in times:
                                                                hours, minutes = t.split('-')
                                                                dtime = datetime.datetime(
                                                                    d.year, d.month, d.day,
                                                                    int(hours), int(minutes))
                                                                showtime.append(dtime)
                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip().split(':')
                                                    dtime = datetime.datetime(
                                                        today.year, today.month, today.day,
                                                        int(hours), int(minutes))
                                                else:
                                                    dtime = t
                                                # De-dup key: datetime + cinema + city + film.
                                                sch_id = '%s%s%s%s' % (
                                                    dtime, cinema_slug, city_slug,
                                                    film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def ivi_ident():
    """Match films from the ivi.ru anti-piracy dump against the catalogue.

    Reads 'dump_<source>.txt' (under the 'ФИЛЬМЫ:' header: a name line
    followed by a code line, entries separated by blank lines), strips an
    optional year suffix from the title, identifies each film and records it
    as a SourceFilms row with the ivi code stored in txt.  Unidentified
    films go to the "<dump>_nof_film" dump.  Python 2 byte strings are used.
    """
    source = ImportSources.objects.get(url='http://antipiracy.ivi.ru/')
    sfilm_clean(source)
    ignored = get_ignored_films()
    # Year suffix of a title: either ", 2014" or " (2014)" at end of line.
    REG_YEAR = re.compile(r'(\,\s\d{4}$)|(\s\(\d{4}\)$)')
    data_nof_film = ''
    noffilms = []
    # Already-imported films of this source, keyed by source_id.
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    films_data = []
    with open('%s/dump_%s.txt' % (settings.API_DUMP_PATH, source.dump), 'r') as f:
        ftype = False  # becomes True once the 'ФИЛЬМЫ:' section starts
        count = 0      # line index within the current entry (1=name, 2=code)
        tmp = {}
        for line in f:
            try:
                l = line.strip()
                if l == 'ФИЛЬМЫ:':
                    ftype = True
                if ftype:
                    if l:
                        count += 1
                        if count == 1:
                            tmp['name'] = l
                        elif count == 2:
                            tmp['code'] = l
                            films_data.append(tmp)
                    else:
                        # Blank line ends the current entry; start a fresh dict
                        # so the appended one is not mutated afterwards.
                        if tmp:
                            tmp = {}
                        count = 0
            except ValueError:
                pass
    for i in films_data:
        name = i['name']
        code = i['code']
        year = REG_YEAR.findall(name)
        if year:
            # findall returns (comma-form, paren-form) groups; one is empty.
            name_clear = REG_YEAR.sub('', name)
            year = ''.join(year[0])
            year = year.replace(',', '').replace('(', '').replace(')', '').strip()
        else:
            year = None
            name_clear = name
        name_slug = low(del_separator(name_clear))
        # The slug of the full (year-bearing) title is the source id.
        film_id = low(del_separator(name))
        if film_id.decode('utf-8') not in ignored and film_id not in noffilms:
            obj = films.get(film_id.decode('utf-8'))
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    kid, info = film_identification(name_slug, None, {}, {},
                                                    year=year, source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        new = create_sfilm(film_id, kid, source, name, txt=code)
                        films[film_id.decode('utf-8')] = new
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(new)
                elif not obj:
                    # Could not identify: record in the dump, skip this id later.
                    data_nof_film += xml_noffilm(name, name_slug, None, None,
                                                 film_id, info, None, source.id)
                    noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'players', 'Онлайн плееры')