def save_sms_sources(request):
    ''' Parser for the sms.txt file '''
    # get the Russian language object
    lang = Language.objects.get(pk=1)

    list_all = []
    sms_file = open(rel('sources/sms.txt'), 'r')
    # read the data from the file
    for line in sms_file.read().split('\n'):
        listt = []
        for i, l in enumerate(line.split('\t')):
            if i == 1:
                listt.append(capit(low(l)))
            elif i == 2:
                listt.append(l.split(' ')[0])
            elif i == 4:
                listt.append(l)
        # collect the row data into the list
        list_all.append(listt)
    sms_file.close()

    # get the SMS import source object
    source = ImportSources.objects.get(source='SMS')

    # walk through the parsed rows
    for l in list_all:
        try:
            if l[1] != 'ЗАКРЫТ':
                # strip special characters from the city name
                slug_city = low(del_separator(l[1]))
                # look up by the cleaned name
                try:
                    city = City.objects.get(name__name=slug_city)
                except City.DoesNotExist:
                    # if not found, look up by the raw name from the source
                    try:
                        city = City.objects.get(name__name=l[1])
                    except City.DoesNotExist:
                        # if not found, look up by the lower-cased, capitalized source name
                        try:
                            city = City.objects.get(name__name=capit(low(l[1])))
                        except City.DoesNotExist:
                            city = None

                if city:
                    # strip special characters from the cinema name
                    slug_cinema = low(del_separator(l[0]))
                    # look up by the cleaned name
                    try:
                        cinema = Cinema.objects.get(name__name=slug_cinema, city=city.id)
                    except Cinema.DoesNotExist:
                        # if not found, look up by the raw name from the source
                        try:
                            cinema = Cinema.objects.get(name__name=l[0], city=city.id)
                        except Cinema.DoesNotExist:
                            cinema = None

                    if cinema:
                        # get/create the halls for this cinema in this city
                        name1 = create_hallname(1, lang, 'без указания зала')
                        name2 = create_hallname(2, lang, 'безуказаниязала')
                        hall = create_hall((name1, name2), 0, 0, cinema)
                        # store the source url in the DB so showtimes can be fetched later
                        try:
                            HallsSources.objects.get(id_hall=hall, source=source, url_hall_sources=l[2])
                        except HallsSources.DoesNotExist:
                            HallsSources(id_hall=hall, source=source, url_hall_sources=l[2]).save()
                    else:
                        # cinema not found: log it
                        logger(**{'event': 2, 'code': 2, 'bad_obj': l[0], 'obj1': l[1], 'obj2': l[2], 'extra': city.id})
                else:
                    # city not found: log it
                    logger(**{'event': 2, 'code': 1, 'bad_obj': capit(low(l[1])), 'obj2': l[2]})
        except IndexError:
            pass
    return HttpResponseRedirect(reverse("main_kai"))
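# ---------------------------------------------------------------------------
# Note: the parsers in this module all rely on a small set of text-normalization
# helpers (low, capit, del_separator, del_screen_type) imported from elsewhere in
# the project. Their real implementations are not shown here; the sketch below is
# only an assumption of their behaviour, kept for reference:
#
#   def low(s):
#       # lower-case a UTF-8 encoded byte string
#       return s.decode('utf-8').lower().encode('utf-8')
#
#   def capit(s):
#       # capitalize the first letter of a UTF-8 encoded byte string
#       u = s.decode('utf-8')
#       return (u[:1].upper() + u[1:]).encode('utf-8')
#
#   def del_separator(s):
#       # drop punctuation/separator characters, keeping letters, digits and spaces
#       u = s.decode('utf-8')
#       return re.sub(ur'[^a-zа-я0-9 ]+', '', u, flags=re.UNICODE | re.IGNORECASE).encode('utf-8')
#
#   def del_screen_type(s):
#       # strip screen-format markers such as "3D" / "2D" from a film title
#       return re.sub(r'\s*\(?[23]D\)?\s*', ' ', s).strip()
# ---------------------------------------------------------------------------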
def nowru_ident():
    source = ImportSources.objects.get(url='http://www.now.ru/')
    ignored = get_ignored_films()
    data_nof_film = ''
    nowru_data = Nowru.objects.filter(kid=None)
    for i in nowru_data:
        name_ru_slug = low(del_separator(i.name_ru.encode('utf-8')))
        if name_ru_slug.decode('utf-8') not in ignored:
            name_en_slug = low(del_separator(i.name_en.encode('utf-8')))
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, year=i.year, source=source)
            if kid:
                i.kid = kid
                i.save()
            else:
                if 'slug="%s"' % name_ru_slug not in data_nof_film:
                    name_ru = i.name_ru.encode('utf-8')
                    name_en = i.name_en.encode('utf-8')
                    data_nof_film += xml_noffilm(name_ru, name_ru_slug, name_en, name_en_slug, i.nowru_id, info, None, source.id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
def person_create_func(name_ru, parental, name_en):
    person_obj = AfishaPersons.objects.using('afisha').create(
        birth_year=0,
        birth_mounth=0,
        birth_day=0,
        male=0,
        national=0,
        country_id=0,
        imdb=0
    )
    person = Person.objects.create(kid=person_obj.id)
    names_list = [
        {'name': name_ru.strip(), 'status': 1, 'lang': 1},
        {'name': low(del_separator(name_ru.strip().encode('utf-8'))), 'status': 2, 'lang': 1},
        {'name': name_en.strip(), 'status': 1, 'lang': 2},
        {'name': low(del_separator(name_en.strip().encode('utf-8'))), 'status': 2, 'lang': 2},
        {'name': parental.strip(), 'status': 3, 'lang': 1},
    ]
    for i in names_list:
        if i['name']:
            if i['status'] == 1:
                try:
                    afisha_person_name_create(person_obj, i['name'], i['lang'])
                except db.backend.Database._mysql.OperationalError:
                    i['name'] = i['name'].encode('ascii', 'xmlcharrefreplace')
                    afisha_person_name_create(person_obj, i['name'], i['lang'])
            name, created = person_name_create(i['name'], i['lang'], i['status'])
            person.name.add(name)
    return person_obj
def get_imdb_film_list():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    url = '%scalendar/?region=us' % source.url
    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))
    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        old_date = ''
        for h4 in div.findAll('h4'):
            release = h4.string.encode('utf-8')
            day, month, year = release.split()
            month = get_month_en(low(month))
            rel_date = '%s-%s-%s' % (year, month, day)
            xml += '<date v="%s">' % rel_date
            ul = h4.find_next('ul')
            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>', '').replace('</i>', '')
                            details = details.replace('(', '').replace(')', '')
                        else:
                            details = ''
                        if 'limited' not in low(details) and 'fest' not in low(details) or 'tv premiere' not in low(details):
                            # decode the HTML entities left in the title
                            film_name = li.a.string.encode('utf-8').replace('&quot;', '"').replace('&amp;', '&')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')
                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)
            xml += '</date>'
        ids = ';'.join(set(ids))
        xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)
    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')
def raspishi_relations():
    source = ImportSources.objects.get(url='http://распиши.рф/')
    ignored = get_ignored_films()
    data_nof_film = ''
    domain = u'распиши.рф'
    url = 'http://%s/getfilmxml.php' % domain.encode('idna')
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        films_rid = list(RaspishiRelations.objects.exclude(kid=0).values_list('rid', flat=True))
        xml_data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for i in xml_data.findAll('movie'):
            id = int(i['id'])
            if id not in films_rid:
                name_ru = i.find('name').text.encode('utf-8')
                name_en = i.find('nameeng').text.encode('utf-8')
                name_ru = re.sub(r'\(.*?\)', '', name_ru).strip()
                name_en = re.sub(r'\(.*?\)', '', name_en).strip()
                name_slug = low(del_separator(del_screen_type(name_ru)))
                name_en_slug = low(del_separator(del_screen_type(name_en)))
                if name_slug.decode('utf-8') not in ignored:
                    try:
                        kid, info = film_identification(name_slug, None, {}, {}, source=source)
                        if kid:
                            created = RaspishiRelations.objects.create(
                                rid=id,
                                kid=kid,
                                name_ru=name_ru,
                                name_en=name_en,
                            )
                        else:
                            data_nof_film += xml_noffilm(name_ru, name_slug, name_en, name_en_slug, id, info, None, source.id)
                    except db.backend.Database._mysql.OperationalError:
                        pass
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Укр. сеансы')
def get_cinemate_cc_film(data, source, ignored, noffilms):
    flist = []
    for div in data.findAll('div', {'class': "movie-brief"}):
        h3 = div.find('h3')
        a = h3.find('a')
        film_url = a.get('href')
        film_id = int(film_url.replace('/movie/', '').replace('/', ''))
        film_name = a.text.encode('utf-8')
        film_slug = low(del_separator(film_name))
        if film_slug.decode('utf-8') not in ignored and film_id not in noffilms:
            full_url = '%s%s' % (source.url, film_url.lstrip('/'))
            film_year = int(h3.find('small').text.encode('utf-8').replace('(', '').replace(')', ''))
            next = False
            ul = div.find('ul')
            for link in ul.findAll('a'):
                a_txt = link.text.encode('utf-8').strip()
                if a_txt == 'Скачать':
                    next = True
            if next:
                flist.append({
                    'id': film_id,
                    'name': film_name,
                    'slug': film_slug,
                    'year': film_year,
                    'url': full_url,
                })
    return flist
def imdb_film_ident():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    films = Films.objects.filter(kid=None)
    films_ids = [i.imdb_id for i in films]
    exist_films = Film.objects.using('afisha').filter(idalldvd__in=films_ids)
    exist_ids = {}
    for i in exist_films:
        exist_ids[i.idalldvd] = i.id
    data_nof_film = ''
    for i in films:
        name = None
        for j in i.name.filter(status=1, language__id=2):
            name = j.name.encode('utf-8')
        slug = low(del_separator(name))
        kid = exist_ids.get(long(i.imdb_id))
        if kid:
            i.kid = kid
            i.save()
        else:
            full_url = '%stitle/tt%s/' % (source.url, i.imdb_id)
            data_nof_film += xml_noffilm(name, slug, None, None, i.imdb_id, 'Фильм не найден', full_url.encode('utf-8'), source.id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films_ident', 'Идентификация')
def imdb_search2(imdb_id, name, year, kid):
    film_name = name
    slug = low(del_separator(film_name.encode('utf-8')))
    film_name = film_name.encode('ascii', 'xmlcharrefreplace')
    xml = '<film n="%s" s="%s" y="%s" id="%s" d="" r=""></film>' % (
        film_name, slug, str(year).encode('utf-8'), str(imdb_id).encode('utf-8'))
    data = exp_film_data(imdb_id)
    if data:
        if data.get('double'):
            return simplejson.dumps(data)
        else:
            if not data['kid']:
                pass
            elif int(data['kid']) != int(kid):
                return simplejson.dumps({'status': True, 'redirect': True, 'kid': data['kid']})
    data_nof_persons, distr_nof_data, dump, good = get_imdb_data(xml, False, 1, [int(imdb_id), ], True, kid)
    if good:
        data = exp_film_data(imdb_id)
        if not data:
            data = {'status': False}
    else:
        data = {'status': False}
    if kid:
        cache.delete_many(['get_film__%s' % kid, 'film__%s__fdata' % kid])
    return simplejson.dumps(data)
def part_search(model, var, name, extra):
    ''' Search by partial match '''
    res_list = []
    # strip the screen format (3D, 2D ...) and lower-case the name
    clear_name = low(del_screen_type(name.encode('utf-8')))
    # replace some punctuation with spaces
    clear_name = clear_name.replace('(', ' ').replace(')', ' ').replace(',', ' ').replace('.', ' ').replace(':', ' ')
    # for cities/films a search token must be at least 3 characters long,
    # for cinemas at least 2, otherwise an error message is shown
    max_char = 3 if var == 1 or var == 2 else 2
    # split the name on spaces and search for every part of it
    for cn in clear_name.split(' '):
        if len(cn.decode('utf-8')) >= max_char:
            #result = model.objects.filter(name__name__contains=cn).distinct()  # for the new DB
            # for the old DB ----
            #if var == 1:
            #    result = model.objects.filter(name__contains=cn)
            #else:
            #    result = model.objects.filter(name__name__contains=cn).distinct()
            result = search(model, var, cn, extra)
            # ------------------
            # collect the results in a list and return it
            for i in result:
                res_list.append(i)
    return res_list
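# Example (illustrative only; the model/var values are assumptions based on the
# comments above — var=1/2 for cities/films, anything else for cinemas):
#
#   part_search(Film, 2, u'Хоббит: Пустошь Смауга 3D', {})
#   # cleans the name to 'хоббит пустошь смауга', splits it into tokens,
#   # drops tokens shorter than 3 characters and returns every object
#   # that search() finds for each remaining token.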
def person_name_detect(request, ru, en): try: film_editor = is_film_editor(request) if film_editor: name = escape(strip_tags(ru)).encode('utf-8').strip() en = escape(strip_tags(en)).encode('utf-8').strip() slug_ru = low(del_separator(name)) slug_en = low(del_separator(en)) queries = [] if name: queries.append(Q(name__icontains=slug_ru, status=1)) if en: queries.append(Q(name__icontains=en, status=1)) query = queries.pop() for item in queries: query |= item data = list(NamePerson.objects.filter(query, language__id__in=(1,2), person__kid__gt=0).values('language', 'person__kid', 'name')) names = {} for i in data: if not names.get(i['person__kid']): names[i['person__kid']] = {'ru': '', 'en': '', 'id': i['person__kid']} if i['language'] == 1: names[i['person__kid']]['ru'] = i['name'] elif i['language'] == 2: names[i['person__kid']]['en'] = i['name'] names = sorted(names.values(), key=operator.itemgetter('ru')) txt = '' for i in names: txt += u'<div style="border-bottom:1px solid #CCC; padding:5px; background:#EBEBEB; min-width: 300px;"><a href="http://kinoinfo.ru/person/%s/" target="_blank">%s / %s</a></div>' % (i['id'], i['ru'], i['en']) if txt: txt = u'В базе есть похожие персоны:<br />%s' % txt return simplejson.dumps({ 'status': True, 'content': txt, }) except Exception as e: open('errors.txt','a').write('%s * (%s)' % (dir(e), e.args))
def organization_uni_temp(request):
    orgs = Organization.objects.all().order_by('name')
    for i in orgs:
        result = unidecode(i.name)
        result = re.findall(ur'[a-z0-9]+', low(result))
        result = '-'.join(result) if result else ''
        i.uni_slug = '%s-%s' % (result, i.id)
        i.save()
    return HttpResponse(str())
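# Example of the slug produced above (illustrative; the exact unidecode
# transliteration is an assumption):
#
#   i.name = u'Синема Парк'  ->  unidecode: 'Sinema Park'
#   ->  tokens ['sinema', 'park']  ->  i.uni_slug = 'sinema-park-<i.id>'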
def get_rutracker_topics_closed(): REG_SIZE = re.compile(r'\[\d+\.?\d+?\s?\w+\]') REG_SLUG = re.compile(ur'[a-zа-я0-9]+') source = ImportSources.objects.get(url='http://rutracker.org/') films = SourceFilms.objects.filter(source_obj=source) films_dict = {} for i in films: films_dict[i.name_alter] = i url = 'http://rutracker.org/forum/index.php?closed=1' req = urllib.urlopen(url) for_del = [] if req.getcode() == 200: data = BeautifulSoup(req.read(), from_encoding="windows-1251") nav = data.find('ul') if nav: for i in nav.findAll('li'): title = i.b.text.strip().encode('utf-8') if ' / ' in title: name_alt = re.findall(REG_SLUG, low(title).decode('utf-8')) name_alt = ''.join(name_alt) obj = films_dict.get(name_alt) if obj: for_del.append(obj.id) SourceFilms.objects.filter(pk__in=set(for_del)).delete() ''' if req.getcode() == 200: data = BeautifulSoup(req.read(), from_encoding="utf-8") return HttpResponse(str(data)) nav = data.find('div', {'class': 'cl-pg'}) for a in nav.findAll('a'): link = a.get('href').encode('utf-8') if 'start' in link: new_url = '%sforum/%s' % (source.url, link) links.append(new_url) for url in links: req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read(), from_encoding="utf-8") for i in data.findAll('b'): title = i.text.encode('utf-8').strip() if ' / ' in title: name_alt = re.findall(REG_SLUG, low(title).decode('utf-8')) name_alt = ''.join(name_alt) obj = films_dict.get(name_alt) if obj: obj.delete() ''' cron_success('xml', source.dump, 'films_closed', 'Закрытые фильмы')
def addfile(request, m=None): if m and m != 'm': raise Http404 if request.POST: from user_registration.func import md5_string_generate from api.func import resize_image next = request.POST.get('fnext') new = request.POST.get('new_msg_id') f = request.FILES.get('file') # если есть файл if f and f.size < 5000000: file_format = low(f.name.encode('utf-8')) img_format = re.findall(r'\.(jpg|png|jpeg|bmp|gif)$', file_format) # если подходит формат if img_format: img_path = '%s/%s' % (settings.WF_PATH, 'women') try: os.makedirs(img_path) except OSError: pass # если существует сообщение для файла try: obj = WFOpinion.objects.using('afisha').get(pk=new) except WFOpinion.DoesNotExist: pass else: # если юзер загружающий файл является автором сообщения if obj.user_id == request.profile.kid: img_obj = f.read() img_name = '%s.%s' % (new, img_format[0]) img_path_tmp = '%s/%s' % (img_path, img_name) with open(img_path_tmp, 'wb') as f: f.write(img_obj) resized = resize_image(1000, None, img_obj, 1500) if resized: resized.save(img_path_tmp) if next: if m: return HttpResponseRedirect(reverse('women_forum', kwargs={'m': m, 'topic': next})) else: return HttpResponseRedirect(reverse('women_forum', kwargs={'topic': next})) if m: return HttpResponseRedirect(reverse('women_forum', kwargs={'m': m})) else: return HttpResponseRedirect(reverse('women_forum')) else: raise Http404
def org_slufy_names(request):
    orgs = Organization.objects.all()
    for i in orgs:
        lo = low(i.name.encode('utf-8'))
        name_slug = re.findall(ur'[a-zа-я0-9]', lo.decode('utf-8'))
        name_slug = ''.join(name_slug) if name_slug else ''
        i.slug = name_slug
        i.save()
    return HttpResponse(str())
def get_name_film_obj(film):
    ''' Get the film name object '''
    # strip the screen format (3D, 2D ...) from the name
    f = del_screen_type(film)
    # strip special characters and lower-case the name
    f = low(del_separator(f))
    # look up by the cleaned name
    try:
        name = NameProduct.objects.filter(name=f)[0]
    except IndexError:
        # if not found, look up by the source name
        try:
            name = NameProduct.objects.filter(name=film)[0]
        except IndexError:
            # if not found, look up by the lower-cased source name
            try:
                name = NameProduct.objects.filter(name=low(film))[0]
            except IndexError:
                # if not found, look up by the capitalized source name
                try:
                    name = NameProduct.objects.filter(name=capit(film))[0]
                except IndexError:
                    name = None
    return name
def create_news(request, tags, name, text, reader_type, nick=0, extra=None, visible=None): profile = request.profile current_site = request.current_site subdomain = request.subdomain if request.subdomain else 0 if current_site.domain in ('kinoinfo.ru', 'kinoafisha.ru'): subdomain = 0 language = None if current_site.domain == 'imiagroup.com.au': try: language = Language.objects.get(code=request.current_language) except Language.DoesNotExist: pass tags_list = [] for i in tags: tag = i.strip() t_list = (tag, capit(tag), low(tag)) tag_obj = None for t in t_list: try: tag_obj = NewsTags.objects.get(name=t) break except NewsTags.DoesNotExist: pass if not tag_obj: tag_obj = NewsTags.objects.create(name=t_list[0]) tags_list.append(tag_obj) if visible is None: visible = True if text else False news = News.objects.create( title = name, autor = profile, site = current_site, subdomain = subdomain, language = language, text = text, visible = visible, reader_type = reader_type, autor_nick = nick, extra = extra, ) for i in set(tags_list): news.tags.add(i) NewsTags.objects.filter(news=None).delete() return news
def get_news_tags(request, id, arr): #try: news = News.objects.get(pk=id) if arr: profile = RequestContext(request).get('profile') is_editor = False try: org = OrganizationNews.objects.select_related('organization').get( news=news) if profile in org.organization.editors.all(): is_editor = True except OrganizationNews.DoesNotExist: pass if request.user.is_superuser or is_editor or request.is_admin: arr = set(arr) tags_error = False tags_objs = {} tags_list = [] tags = NewsTags.objects.all() for i in tags: tags_objs[i.name] = i for i in arr: t_list = (i, capit(i).decode('utf-8'), low(i).decode('utf-8')) tag_obj = None for t in t_list: tag_obj = tags_objs.get(t) if tag_obj: break if not tag_obj: tag_obj = NewsTags.objects.create(name=t_list[0]) tags_list.append(tag_obj) org_tags = [i for i in news.tags.all()] for i in org_tags: news.tags.remove(i) for i in tags_list: news.tags.add(i) return simplejson.dumps({ 'status': True, 'err': False, 'content': sorted(arr) }) return simplejson.dumps({'status': False})
def imdb_person_data(id): url = 'http://www.imdb.com/name/nm%s/bio' % id resp, content = httplib2.Http(disable_ssl_certificate_validation=True).request(url) result = {'bio': '', 'birth': '', 'place': '', 'country': '', 'poster': ''} if resp['status'] == '200': data = BeautifulSoup(content, "html5lib", from_encoding="utf-8") table = data.find('table', id="overviewTable") birth_day = 0 birth_month = 0 birth_year = 0 if table: trs = table.findAll('tr') for a in trs[0].findAll('a'): href = a.get('href').encode('utf-8') if '?birth_monthday' in href: birth_day, birth_month = a.text.strip().split() birth_month = get_month_en(low(birth_month)) elif '?birth_year' in href: birth_year = a.text.encode('utf-8') elif '?birth_place' in href: result['place'] = a.text.encode('utf-8') result['country'] = a.text.split(',')[-1].split('[')[0].strip() if birth_day and birth_month and birth_year: result['birth'] = datetime.date(int(birth_year), int(birth_month), int(birth_day)) bio_block = data.find('a', {'name': 'mini_bio'}).find_next('p') if bio_block: result['bio'] = bio_block.text.strip().encode('utf-8') poster = data.find('img', {'class': 'poster'}) if poster: poster = poster.get('src').split('._V1_')[0] poster += '._V1_SX640_SY720_.jpg' result['poster'] = poster return result
def imdb_person_search(request, pid, name, exist): try: from person.views import person_name_create from release_parser.imdb import imdb_person_searching if request.user.is_superuser: name = escape(strip_tags(name)).encode('utf-8').strip() slug = low(del_separator(name)) person = Person.objects.get(pk=pid) # если не было имени (en), то создаю if not exist: if name: exist = True person_names = person.name.all() names = [ {'name': name, 'status': 1}, {'name': slug, 'status': 2}, ] for i in names: name_obj, created = person_name_create(i['name'], i['status'], 2) if name_obj not in person_names: person.name.add(name_obj) if exist: result = imdb_person_searching(name) txt = '' for i in result: txt += '<div style="border-bottom:1px solid #CCC; padding:5px; background:#EBEBEB; min-width: 300px;"><a href="http://www.imdb.com%s" target="_blank">%s</a> <i>%s</i><br /> <input type="button" value="Выбрать" id="%s" class="imdb_person_list_select" /></div>' % (i['link'].encode('utf-8'), i['title'], i['details'], i['id']) txt += '<br /><div>Или укажите ссылку на страницу персоны IMDb:<br /><input type="text" value="" size="40" class="imdb_person_url" /> <input type="button" value="Искать" class="imdb_person_list_select" /><input type="hidden" value="%s" id="pid" /></div>' % person.id return simplejson.dumps({ 'status': True, 'content': txt, 'query': name, }) return simplejson.dumps({}) except Exception as e: open('errors.txt','a').write('%s * (%s)' % (dir(e), e.args))
def get_okinoua_cities():
    """ Parse the list of cities in Ukraine """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # get the list of already imported cities from the SourceCities table
    cities_ids = get_source_data(source, 'city', 'list')

    data_nof_city = ''

    # open the page with the city list
    url = '%skinoafisha-kiev/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        page = BeautifulSoup(req.read(), from_encoding="utf-8")
        # find all city tags and read the id and name of each city
        for ul in page.findAll('ul', {'class': 'blist'}):
            for li in ul.findAll('li'):
                id = li.a.get('href').replace('/', '')
                name = li.a.string.encode('utf-8').strip()
                name_slug = low(del_separator(name))
                # compare the scraped cities with the cities in our DB; if there is no match,
                if id not in cities_ids:
                    # try to identify the new city
                    city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                    # if identified, store it in SourceCities
                    if city.count() == 1:
                        SourceCities.objects.create(
                            source_id=id,
                            source_obj=source,
                            city=city[0],
                            name=name,
                        )
                    # otherwise record the unidentified city in the xml dump for later processing
                    else:
                        if 'slug="%s"' % name_slug not in data_nof_city:
                            data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
    create_dump_file('okinoua_nof_city', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', 'okinoua', 'cities', 'Укр. города')
def get_rambler_cities(): source = ImportSources.objects.get(url='http://www.rambler.ru/') cities_ids = get_source_data(source, 'city', 'list') data_nof_city = '' ''' # LOCALHOST f = open('%s/dump_rambler_city.xml' % settings.API_DUMP_PATH, 'r') xml = BeautifulSoup(f.read(), from_encoding="utf-8") f.close() if xml: # --- end localhost ''' # SERVER url = 'http://api.kassa.rambler.ru/v2/%s/xml/cities/' % RAMBLER_API_KEY # dump_rambler_city.xml req = urllib.urlopen(url) if req.getcode() == 200: xml = BeautifulSoup(req.read(), from_encoding="utf-8") # --- end server for i in xml.findAll('city'): id = i.cityid.string name = i.find('name').string.encode('utf-8') name_slug = low(del_separator(name)) if id not in cities_ids: city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk') if city.count() == 1: SourceCities.objects.create( source_id=id, source_obj=source, city=city[0], name=name, ) else: if 'slug="%s"' % name_slug not in data_nof_city: data_nof_city += '<city name="%s" slug="%s"></city>' % ( name, name_slug) create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city) cron_success('xml', source.dump, 'cities', 'Города')
def check_tag(tag, id):
    tags = capit(tag).decode('utf-8')
    # note: the capitalized value above is immediately overwritten; only the lower-cased tag is used
    tags = low(tag).decode('utf-8')
    obj = ""
    try:
        obj = OrganizationTags.objects.get(name=tags)
        #obj = Organization.objects.get(tags__name=tags)
        debug_logs("already exists: %s " % (obj))
    except MultipleObjectsReturned:
        debug_logs("MultipleObjectsReturned: %s " % (obj))
    except ObjectDoesNotExist:
        debug_logs("ObjectDoesNotExist %s" % tags.encode('utf-8'))
        created = OrganizationTags.objects.create(name=tags, group_flag="org_name_tag")
        tag = OrganizationTags.objects.get(name=tags)
        org = Organization.objects.get(pk=id)
        org.tags.add(tag)
        debug_logs("tag %s, created %s " % (tag, created))
def get_premierzal_cities():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    cities = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        block = data.find('div', {'class': 'drop'})
        for i in block.findAll('a'):
            city_name = i.text.encode('utf-8').strip()
            city_id = low(del_separator(city_name))
            if city_id.decode('utf-8') not in cities:
                city = City.objects.filter(name__name=city_id, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_id)
                cities.append(city_id.decode('utf-8'))
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', source.dump, 'cities', 'Города')
def parse_address(data):
    num, name, atype = ('', '', '')
    if data:
        num = re.findall(ur'^\d+\-?\d*\w?\s', data)
        num = num[0].strip() if num else ''
        data = re.sub(ur'^\d+\-?\d*\w?\s', '', data)
        types = {
            'st': 70, 'street': 70,
            'rd': 71, 'road': 71, 'rds': 71,
            'ave': 72, 'avenue': 72,
            'hwy': 73,
            'dr': 74,
            'crt': 75,
            'pde': 76,
        }
        low_data = low(data)
        for k, v in types.iteritems():
            tmp = '%s.' % k
            if k in low_data or tmp in low_data:
                atype = v
        if not atype:
            atype = 70
        name = re.sub(
            r'(Avenue|avenue|Ave|ave|Street|street|Road|road|Rds|Hwy|hwy|Crt|crt|Pde|pde|St|st|Rd|rd|Dr)\.?$',
            '', data)
    return num, name.strip(), atype
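# Examples of the expected behaviour (illustrative, derived from the code above):
#
#   parse_address(u'12 George St')  ->  ('12', 'George', 70)
#   parse_address(u'5 Ocean Rd')    ->  ('5',  'Ocean',  71)
#   parse_address(u'Main Avenue')   ->  ('',   'Main',   72)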
def organizations_doubles(request):
    orgs = Organization.objects.all().order_by('name')
    unique = {}
    double = []
    txt = ''
    for i in orgs:
        result = unidecode(i.name)
        result = re.findall(ur'[a-z0-9]+', low(result))
        result = '-'.join(result) if result else ''
        obj = unique.get(result)
        if obj:
            if obj not in double:
                txt += '%s <a href="http://kinoinfo.ru/organizations/show/%s" target="_blank">(ID %s)</a><br />' % (obj.name.encode('utf-8'), obj.id, obj.id)
                double.append(obj)
            double.append(i)
            txt += '%s <a href="http://kinoinfo.ru/organizations/show/%s" target="_blank">(ID %s)</a><br />' % (i.name.encode('utf-8'), i.id, i.id)
        else:
            unique[result] = i
    count = len(double)
    txt = '<b>Всего %s</b><br /><br />%s' % (count, txt)
    return HttpResponse(str(txt))
def get_zapad24ru(): ignored = get_ignored_films() ignored_cinemas = get_ignored_cinemas() source = ImportSources.objects.get(url='http://zapad24.ru/') sfilm_clean(source) cities_dict = get_source_data(source, 'city', 'dict') cinemas_dict = get_source_data(source, 'cinema', 'dict') schedules = get_source_data(source, 'schedule', 'list') films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) today = datetime.datetime.now() next_month = datetime.date.today() + datetime.timedelta(days=40) data_nof_films = '' data_nof_cinema = '' data_nof_city = '' noffilms = [] req = urllib.urlopen('%safisha/' % source.url) if req.getcode() == 200: data = BeautifulSoup(req.read()) #, from_encoding="utf-8" div = data.find('div', align="left") for ind, table in enumerate( div.findAll('table', border="0", cellpadding="0", cellspacing="0", width="100%")): cinema_tag = table.find('strong').string.encode('utf-8') cinema_name = re.findall(r'\".+\"', cinema_tag)[0].replace('"', '').strip() cinema_slug = low(del_separator(cinema_name)) cinema_id = cinema_slug.decode('utf-8') city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace( '(г. ', '').replace(')', '').strip() city_slug = low(del_separator(city_name)) city_id = city_slug.decode('utf-8') city_obj = cities_dict.get(city_id) if not city_obj: city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk') if city.count() == 1: city_obj = SourceCities.objects.create( source_id=city_id, source_obj=source, city=city[0], name=city_name, ) cities_dict[city_id] = city_obj else: if 'slug="%s"' % city_slug not in data_nof_city: data_nof_city += '<city name="%s" slug="%s"></city>' % ( city_name, city_slug) if city_obj: cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid) if cinema_ig_id not in ignored_cinemas: cinema_obj = cinemas_dict.get(cinema_id) if not cinema_obj: filter1 = { 'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city } cinema_kid = cinema_identification( cinema_slug, filter1) if cinema_kid: try: cinema = Cinema.objects.get(code=cinema_kid) cinema_obj = SourceCinemas.objects.create( source_id=cinema_id, source_obj=source, city=city_obj, cinema=cinema, name=cinema_name, ) cinemas_dict[cinema_id] = cinema_obj except Cinema.DoesNotExist: pass else: if 'slug="%s"' % cinema_slug not in data_nof_cinema: data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % ( cinema_name, cinema_slug, city_name, city_obj.city.kid) if cinema_obj: film_table = table.find('table') date_from = None date_to = None for tr in film_table.findAll('tr'): film_name, film_slug, film_id = (None, None, None) if ind == 0: film_name = tr.find('b').string.encode( 'utf-8').strip() film_slug = low(del_separator(film_name)) film_id = film_slug.decode('utf-8') else: showdate = '' for f in tr.findAll('b'): if f.find('span'): showdate = f.find( 'span').string.encode( 'utf-8').strip() else: film_name = f.string.encode( 'utf-8').strip() film_name = re.findall( r'\«.+\»', film_name)[0] film_name = film_name.replace( '«', '').replace('»', '').strip() film_slug = low( del_separator(film_name)) film_id = film_slug.decode('utf-8') if showdate and film_name: try: date_from, date_to = showdate.split( '-') date_from_day, date_from_month = date_from.strip( ).split('.') date_to_day, date_to_month = date_to.strip( ).split('.') except ValueError: date_from, date_to = showdate.split( ' – ') date_from_day, date_from_month = 
date_from.strip( ).split() date_from_month = get_month( date_from_month) date_to_day, date_to_month = date_to.strip( ).split() date_to_month = get_month( date_to_month) date_from = datetime.date( today.year, int(date_from_month), int(date_from_day)) date_to = datetime.date( today.year, int(date_to_month), int(date_to_day)) full_url = tr.find('a').get('href').encode('utf-8') if film_id not in noffilms and film_id not in ignored: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification( film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func( fdict, kid, obj) if create_new: objt = create_sfilm( film_id, kid, source, film_name) films[film_id] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append( objt) elif not obj: data_nof_film += xml_noffilm( film_name, film_slug, None, None, film_id.encode('utf-8'), info, full_url, source.id) noffilms.append(film_id) if objt: req_film = urllib.urlopen(full_url) if req_film.getcode() == 200: data_film = BeautifulSoup( req_film.read() ) #, from_encoding="utf-8" td = data_film.find( 'td', { 'class': 'news' }).div.text.encode('utf-8') showtime = [] if ind == 0: showtime = re.findall( r'\d+\:\d+\s\s?', td) else: if date_from and date_to: if date_to < next_month: showtimes = re.findall( r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td) times = [] for t in showtimes: t = t.replace( 'Начало сеансов:', '').split(',') times = [ i.strip() for i in t if i.strip() ] delta = date_to - date_from for day in range( delta.days + 1): d = date_from + datetime.timedelta( days=day) for t in times: hours, minutes = t.split( '-') dtime = datetime.datetime( d.year, d.month, d.day, int(hours), int(minutes )) showtime.append( dtime) for t in showtime: if ind == 0: hours, minutes = t.strip( ).split(':') dtime = datetime.datetime( today.year, today.month, today.day, int(hours), int(minutes)) else: dtime = t sch_id = '%s%s%s%s' % ( dtime, cinema_slug, city_slug, film_id.encode('utf-8')) sch_id = sch_id.replace( ' ', '').decode('utf-8') if sch_id not in schedules: SourceSchedules.objects.create( source_id=sch_id, source_obj=source, film=objt, cinema=cinema_obj, dtime=dtime, ) schedules.append(sch_id) create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city) create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films) cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_premierzal_cinemas(): source = ImportSources.objects.get(url='http://www.premierzal.ru/') cinemas = get_source_data(source, 'cinema', 'list') cities_dict = get_source_data(source, 'city', 'dict') cinemas_dict = {} for i in Cinema.objects.all(): cinemas_dict[i.code] = i ignored_cinemas = get_ignored_cinemas() data_nof_cinema = '' city = cities_dict.values()[0] body = urllib.urlencode({ 'city': city.name.encode('utf-8'), }) url = '%stheatres?%s' % (source.url, body) req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read()) blocks = [] block1 = data.find('div', {'class': 'this_city_theatres'}) block2 = data.find('div', {'class': 'other_city_theatres'}) if block1: blocks.append(block1) if block2: blocks.append(block2) for ind, block in enumerate(blocks): for a in block.findAll('a'): cinema_name = a.text.encode('utf-8').strip().replace('"', '') cinema_id = a.get('href').replace('/theatres/', '').replace('/', '') if ind == 0: city_obj = city else: city_name, cinema_name = cinema_name.split(',') cinema_name = cinema_name.strip() city_slug = low(del_separator(city_name.strip())) city_obj = cities_dict.get(city_slug.decode('utf-8')) cinema_slug = low(del_separator(cinema_name)) if city_obj: cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid) if cinema_id.decode( 'utf-8' ) not in cinemas and cinema_ig_id not in ignored_cinemas: filter1 = { 'name__name': cinema_slug, 'name__status': 2, 'city__id': city_obj.city_id } cinema = cinema_identification(cinema_slug, filter1) cin_obj = cinemas_dict.get(cinema) if cin_obj: SourceCinemas.objects.create( source_id=cinema_id, source_obj=source, city=city_obj, cinema=cin_obj, name=cinema_name, ) cinemas.append(cinema_id.decode('utf-8')) else: data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % ( cinema_name, cinema_slug, city_obj.name.encode('utf-8'), city_obj.city.kid) create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema) cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
def get_premierzal_schedules(): data_nof_film = '' noffilms = [] ignored = get_ignored_films() source = ImportSources.objects.get(url='http://www.premierzal.ru/') sfilm_clean(source) films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) schedules = get_source_data(source, 'schedule', 'list') cities_cinemas = {} for i in SourceCinemas.objects.select_related('city').filter( source_obj=source): if not cities_cinemas.get(i.city.source_id): cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []} cities_cinemas[i.city.source_id]['cinemas'].append(i) for k, v in cities_cinemas.iteritems(): city_url_encode = urllib.quote(v['city'].name.encode('utf-8')) for i in v['cinemas']: main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id, city_url_encode) main_req = urllib.urlopen(main_url) if main_req.getcode() == 200: data = BeautifulSoup(main_req.read()) data = data.find('div', id="films-list") if data: dates = [] for calendar in data.findAll('table', {'class': 'calendar'}): for a in calendar.findAll('a'): href = a.get('href', '') href_dict = dict(cgi.parse_qsl(href)) calendar_date = href_dict.get( u'?date', href_dict.get(u'date')) if calendar_date: dates.append({ 'date': calendar_date, 'href': href }) for ind, d in enumerate(dates): films_blocks = [] if ind == 0: films_blocks = data.findAll( 'div', {'class': 'film-item-wrapper'}) else: url = '%s?date=%s&city=%s&theatre=%s' % ( source.url, d['date'], city_url_encode, i.source_id) req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read()) data = data.find('div', id="films-list") films_blocks = data.findAll( 'div', {'class': 'film-item-wrapper'}) time.sleep(random.uniform(0.8, 2.2)) for block in films_blocks: title = block.find('div', { 'class': 'title' }).find('a') film_name = title.text.encode('utf-8').strip() film_slug = low( del_separator(del_screen_type(film_name))) film_id = film_slug if film_id not in noffilms and film_slug.decode( 'utf-8') not in ignored: obj = films.get(film_id.decode('utf-8')) next_step = checking_obj(obj) if next_step: if obj: kid = obj.kid else: kid, info = film_identification( film_slug, None, {}, {}, source=source) objt = None if kid: create_new, objt = unique_func( fdict, kid, obj) if create_new: objt = create_sfilm( film_id, kid, source, film_name) films[film_id.decode( 'utf-8')] = objt if not fdict.get(kid): fdict[kid] = { 'editor_rel': [], 'script_rel': [] } fdict[kid]['script_rel'].append( objt) elif not obj: data_nof_film += xml_noffilm( film_name, film_slug, None, None, film_id, info, None, source.id) noffilms.append(film_id) if objt: year, month, day = d['date'].split( u'-') for tm in block.findAll( 'div', {'class': 'seanse-item'}): for t in tm.text.encode( 'utf-8').split('|'): t = re.findall( r'\d{2}\:\d{2}', t) if t: hours, minutes = t[ 0].strip().split(':') dtime = datetime.datetime( int(year), int(month), int(day), int(hours), int(minutes)) sch_id = '%s%s%s' % ( dtime, i.source_id.encode( 'utf-8'), film_id) sch_id = sch_id.replace( ' ', '').decode('utf-8') if sch_id not in schedules: SourceSchedules.objects.create( source_id=sch_id, source_obj=source, film=objt, cinema=i, dtime=dtime, ) schedules.append( sch_id) time.sleep(random.uniform(1.1, 1.8)) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_yovideo(): source = ImportSources.objects.get(url='http://www.yo-video.net/') sfilm_clean(source) today = datetime.datetime.now() french_month = { '1': 'janvier', '2': 'fevrier', '3': 'mars', '4': 'avril', '5': 'mai', '6': 'juin', '7': 'juillet', '8': 'aout', '9': 'septembre', '10': 'octobre', '11': 'novembre', '12': 'decembre', } data_nof_film = '' noffilms = [] ignored = get_ignored_films() films = {} source_films = SourceFilms.objects.filter(source_obj=source) for i in source_films: films[i.source_id] = i fdict = get_all_source_films(source, source_films) main_urls = [] for i in range(today.month, 13): m = french_month.get(str(i)) url = '%sfr/sorties/cinema/%s/%s/' % (source.url, today.year, m) req = urllib.urlopen(url) if req.getcode() == 200: data = BeautifulSoup(req.read(), from_encoding="utf-8") for h2 in data.findAll('h2'): day = h2.findAll('span', limit=1)[0].string.encode('utf-8') time.sleep(1) req2 = urllib.urlopen('%s%s' % (url, day)) if req2.getcode() == 200: data2 = BeautifulSoup(req2.read(), from_encoding="utf-8") release_date = datetime.date(today.year, int(i), int(day)) for film_block in data2.findAll('div', {'class': 'sfilm'}): film_id = film_block.find('a').get('href').encode('utf-8') full_url = '%s%s' % (source.url, film_id.lstrip('/')) name = film_block.find('img').get('alt').encode('utf-8').replace('Film ', '') slug = low(del_separator(name)) if slug.decode('utf-8') not in ignored and film_id not in noffilms: obj = films.get(film_id) next_step = checking_obj(obj) if next_step: kid = None if obj: kid = obj.kid if not kid: req3 = urllib.urlopen(full_url) if req3.getcode() == 200: data3 = BeautifulSoup(req3.read(), from_encoding="utf-8") h3 = data3.find('h3') alter_name = None alter_name_slug = None if h3: alter_name = h3.string.encode('utf-8') alter_name_slug = low(del_separator(alter_name)) kid, info = film_identification(slug, alter_name_slug, {}, {}, source=source) txt = None if not kid: div = data3.find('div', {'class': "filmLeft"}) img_url = div.find('img').get('src').encode('utf-8') details = data3.find('div', {'class': "details"}) director = details.find('span', itemprop="name") if director: director = director.string.encode('utf-8').strip() year = re.findall(ur'Année\s?\: \d+', details.text) if year: year = year[0].encode('utf-8').replace('Année','').replace(':','').strip() txt = '%s;%s;%s;%s' % (full_url.encode('utf-8'), img_url, director, year) kid = None objt = None if kid: create_new, objt = unique_func(fdict, kid, obj) if create_new: new = create_sfilm(film_id, kid, source, name, name_alt=alter_name, txt=txt, extra=release_date) films[film_id] = new if not fdict.get(kid): fdict[kid] = {'editor_rel': [], 'script_rel': []} fdict[kid]['script_rel'].append(new) else: if not obj: new = create_sfilm(film_id, kid, source, name, name_alt=alter_name, txt=txt, extra=release_date) films[film_id] = new if not fdict.get(kid): fdict[kid] = {'editor_rel': [], 'script_rel': []} fdict[kid]['script_rel'].append(new) create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film) cron_success('html', source.dump, 'releases', 'Франц.релизы')
def parse_data_ident(request, selected):
    """ Identify the fetched records """
    #try:
    debug_logs("start ident %s " % selected)

    # start timing the function
    start = time.time()

    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    # identification type, passed as a parameter to the identification function
    ident_type = 'movie_online'

    # select all films from the DB flagged with afisha_id=None;
    # every film gets this flag during parsing, i.e. these are films
    # that have never been through kinoafisha identification
    data = MovieMegogo.objects.filter(afisha_id__in=(0, None))

    # get the parameters needed for identification,
    # iterating over each film separately
    for i in data:
        year = i.year
        name_ru = i.title
        name_en = i.title_en
        country = i.country

        # clean the ru/en names for film identification
        name_ru_slug = del_separator(low(name_ru))
        name_en_slug = del_separator(low(name_en))

        # set the year range for film identification
        new_year = year + 2
        old_year = year - 2
        filter_year = {'year__gte': old_year, 'year__lte': new_year}

        try:
            # pass the film to the identification function
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, filter_year, ident_type, country)
            if kid:
                # store the result in the model
                i.afisha_id = kid
                i.save()
            else:
                if i.megogo_id not in noffilms and name_ru_slug.decode('utf-8') not in ignored:
                    data_nof_film += xml_noffilm(name_ru.encode('utf-8'), name_ru_slug, None, None, i.megogo_id, info, i.page.encode('utf-8'))
                    noffilms.append(i.megogo_id)
        except db.backend.Database._mysql.OperationalError:
            if i.megogo_id not in noffilms and name_ru_slug.decode('utf-8') not in ignored:
                data_nof_film += xml_noffilm(name_ru.encode('utf-8'), name_ru_slug, None, None, i.megogo_id, None, i.page.encode('utf-8'))
                noffilms.append(i.megogo_id)

    # function execution time
    finish = time.time()
    timer = "%.2f мин" % ((float(finish - start)) / 60)

    debug_logs("finish")
    debug_logs("timer: %s " % timer)
    debug_logs("Идентификация: название %s / инфо %s %s" % (name_ru_slug, kid, info))

    source = ImportSources.objects.get(url='http://megogo.net/')
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)

    # return to the interface
    return simplejson.dumps({
        'request_type': 1,
        'timer': timer,
    })