def cinemate_cc_login(): source = ImportSources.objects.get(url='http://cinemate.cc/') opener = give_me_cookie() url = '%slogin/' % source.url req = opener.open(urllib2.Request(url)) page = BeautifulSoup(req.read(), from_encoding="utf-8") login_form = page.find('form', id="login_form") if login_form: csrf = login_form.find('input', {'name': 'csrfmiddlewaretoken'})['value'] login = '******' passwd = 'P0mk67H2kq' values = urllib.urlencode({ 'csrfmiddlewaretoken': csrf, 'username': login, 'password': passwd, }) # отправка формы авторизации url += '?next=/profile/%s/' % login try: req = opener.open(urllib2.Request(url, values)) except urllib2.HTTPError, error: return {'source': source, 'opener': opener, 'error': error.read()}
def get_imdb_film_list():
    """Scrape the IMDb US release calendar into an XML dump file.

    Builds nested <date v="..."><film .../></date> elements for upcoming
    wide theatrical releases, skipping documentaries and limited / festival /
    TV-premiere entries, writes the dump via create_dump_file and reports
    cron success.
    """
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    url = '%scalendar/?region=us' % source.url
    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))
    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        for h4 in div.findAll('h4'):
            # each <h4> is a release date heading like "24 October 2014"
            release = h4.string.encode('utf-8')
            day, month, year = release.split()
            month = get_month_en(low(month))
            rel_date = '%s-%s-%s' % (year, month, day)
            xml += '<date v="%s">' % rel_date
            ul = h4.find_next('ul')
            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>', '').replace('</i>', '')
                            details = details.replace('(', '').replace(')', '')
                        else:
                            details = ''
                        # Bug fix: was "... and 'fest' not in low(details) or
                        # 'tv premiere' not in low(details)" — the trailing
                        # "or" clause made the condition almost always true
                        # due to and/or precedence.  All three release kinds
                        # must be excluded.
                        if ('limited' not in low(details)
                                and 'fest' not in low(details)
                                and 'tv premiere' not in low(details)):
                            # Escape the name for the XML attribute; '&' is
                            # replaced first so the ampersand inside '&quot;'
                            # is not double-escaped.
                            film_name = li.a.string.encode('utf-8').replace('&', '&amp;').replace('"', '&quot;')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')
                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)
            xml += '</date>'
    ids = ';'.join(set(ids))
    # Bug fix: a stray "time" token trailing this statement was a syntax error.
    xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)
    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')
def cinemate_cc_get_links():
    """Collect torrent-tracker links for cinemate.cc films.

    For up to 50 source films, walks each film's links page, and for every
    "/go/s/<id>" redirect not already stored resolves the final tracker URL
    and persists it as a CinemateTorrents row.
    """
    source = ImportSources.objects.get(url='http://cinemate.cc/')
    films = {}
    for sf in SourceFilms.objects.filter(source_obj=source)[:50]:
        films[int(sf.source_id)] = sf
    # go_link ids we already have, so we do not re-resolve them
    known_links = list(
        CinemateTorrents.objects.filter(
            film__source_id__in=films.keys()).values_list('go_link_id', flat=True))
    opener = give_me_cookie()
    for source_id, film in films.iteritems():
        page_url = '%smovie/%s/links/#tabs' % (source.url, source_id)
        response = opener.open(urllib2.Request(page_url))
        soup = BeautifulSoup(response.read(), from_encoding="utf-8")
        table = soup.find('div', {'class': "table"})
        for row in table.findAll('div', {'class': "row delimiter"}):
            cells = row.findAll('div')
            tracker = cells[2].text.strip().encode('utf-8')
            quality = cells[3].text.strip().encode('utf-8')
            size = cells[-1].text.strip().encode('utf-8')
            link_id = row.find('a', {
                'class': "icon_t download-link"
            }).get('href', '').replace('/go/s/', '').replace('/', '')
            if link_id in known_links:
                continue
            # follow the redirect page to get the real tracker URL
            go_url = '%sgo/s/%s' % (source.url, link_id)
            go_resp = opener.open(urllib2.Request(go_url))
            go_soup = BeautifulSoup(go_resp.read(), from_encoding="utf-8")
            main = go_soup.find('div', {'class': "main"})
            target = main.find('a', rel="nofollow").get('href')
            CinemateTorrents.objects.create(
                film=film,
                go_link_id=link_id,
                link=target,
                tracker=tracker,
                quality=quality,
                file_size=size,
            )
            # polite delay between redirect fetches
            time.sleep(random.uniform(0.8, 1.2))
    cron_success('html', source.dump, 'links', 'Ссылки на трекеры')
def get_imdb_rate(imdb): imdb_votes = None imdb_rate = None imdb = get_imdb_id(imdb) opener = give_me_cookie() url = 'http://www.imdb.com/title/tt%s/' % imdb try: req = opener.open(urllib2.Request(url)) except urllib2.HTTPError: req = None print "http error" if req: data = BeautifulSoup(req.read(), from_encoding="utf-8") # рейтинг imdb_rate = data.find('span', itemprop="ratingValue") if imdb_rate: imdb_rate = float(imdb_rate.text.encode('utf-8')) imdb_votes = data.find('span', itemprop="ratingCount") imdb_votes = int( imdb_votes.text.encode('utf-8').replace(u' ', '').replace(u',', '')) return imdb_rate, imdb_votes
def cinemate_cc_soon():
    """Import upcoming and in-cinema films from cinemate.cc.

    Crawls the "soon" and "cinema" listings (up to 10 pages each), matches
    every film against the local catalogue, records unmatched films in a
    not-found dump, and — when new films were created — posts a notification
    message to the site operator.

    NOTE(review): an authenticated variant using cinemate_cc_login() used to
    live here (it was kept as a pseudo-docstring of commented-out code); the
    current flow scrapes anonymously.
    """
    source = ImportSources.objects.get(url='http://cinemate.cc/')
    opener = give_me_cookie()
    ignored = get_ignored_films()
    data_nof_film = ''          # accumulated XML for films we failed to match
    noffilms = []               # source ids already reported as not-found
    sfilm_clean(source)
    films = {}                  # source_id (int) -> SourceFilms
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[int(i.source_id)] = i
    fdict = get_all_source_films(source, source_films)
    send_msg = False
    for main_url in ('%smovies/soon' % source.url, '%smovies/cinema' % source.url):
        req = opener.open(urllib2.Request(main_url))
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        # last pagination link tells us how many pages exist; cap at 10
        nav = data.find('div', {'class': "navigation"})
        nav_link = nav.findAll('a')[-1]
        last_page = int(nav_link.get('href').split('?page=')[-1])
        if last_page > 10:
            last_page = 10
        film_list = get_cinemate_cc_film(data, source, ignored, noffilms)
        for page in xrange(2, (last_page + 1)):
            # polite delay between page fetches
            time.sleep(random.uniform(1.0, 2.5))
            url = '%s?page=%s' % (main_url, page)
            try:
                req = opener.open(urllib2.Request(url))
                data = BeautifulSoup(req.read(), from_encoding="utf-8")
                film_list += get_cinemate_cc_film(data, source, ignored, noffilms)
            except urllib2.HTTPError:
                # best-effort: a failing page is simply skipped
                pass
        for i in film_list:
            obj = films.get(i['id'])
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    # try to identify the film in the local catalogue
                    kid, info = film_identification(i['slug'], None, {}, {}, year=i['year'], source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        objt = create_sfilm(i['id'], kid, source, i['name'], year=i['year'], txt=datetime.datetime.now().date(), extra='new')
                        films[i['id']] = objt
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(objt)
                        # at least one new film -> notify the operator below
                        send_msg = True
                elif not obj:
                    # unidentified film: record it in the not-found dump
                    data_nof_film += xml_noffilm(i['name'], i['slug'], None, None, i['id'], info, i['url'].encode('utf-8'), source.id)
                    noffilms.append(i['id'])
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы в сети')
    if send_msg:
        # send an internal site message about newly appeared films
        current_site = DjangoSite.objects.get(domain='kinoinfo.ru')
        msg_from = Profile.objects.get(user__last_name='SYSTEM')
        msg_to = Profile.objects.get(
            accounts__login='******')  # [email protected]
        msg = 'В сети появились новые фильмы <a href="http://kinoinfo.ru/torrents/listing/%s/" target="_blank">http://kinoinfo.ru/torrents/listing/%s/</a>' % (
            source.id, source.id)
        try:
            # reuse the latest existing dialog between the two profiles
            dialog_exist = DialogMessages.objects.filter(
                readers__user=msg_to,
                readers__message__autor=msg_from).order_by('-id')[0]
        except IndexError:
            dialog_exist = None
        reader_type = '1'
        msg_obj = News.objects.create(
            title='Сообщение',
            text=msg,
            autor=msg_from,
            site=current_site,
            subdomain='0',
            reader_type='1',
        )
        reader = NewsReaders.objects.create(user=msg_to, status='0', message=msg_obj)
        if dialog_exist:
            dialog_exist.readers.add(reader)
        else:
            dialog_obj = DialogMessages()
            dialog_obj.save()
            dialog_obj.readers.add(reader)
def get_mailru_soon():
    """Import upcoming releases from afisha.mail.ru.

    Walks the "soon" calendar for the current month plus the next 12 months,
    matches each film against the local catalogue, stores/updates its
    release date (SourceReleases), fetches poster and description for newly
    created releases, and dumps unmatched films to the not-found file.
    """
    data_nof_film = ''          # accumulated XML for unmatched films
    noffilms = []               # film ids already reported as not-found
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='https://afisha.mail.ru/')
    sfilm_clean(source)
    films = {}                  # source_id (unicode) -> SourceFilms
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    # current month plus the following 12 months
    today = datetime.datetime.today()
    dates = list(
        map((lambda x: today.date() + relativedelta(months=x)), xrange(1, 13)))
    dates.insert(0, today.date())
    for d in dates:
        main_url = '%scinema/soon/%s/%s/' % (source.url, d.year, d.month)
        opener = give_me_cookie()
        try:
            req = opener.open(urllib2.Request(main_url))
        except urllib2.HTTPError:
            req = None
        if req:
            data = BeautifulSoup(req.read(), "html.parser")
            for block in data.findAll('div', {'class': 'premiere__date'}):
                # each block groups films premiering on one day of the month
                day = block.find('div', {'class': 'premiere__date__mday'}).text
                if day:
                    release_date = datetime.date(d.year, d.month, int(day))
                    for item in block.findAll('div', {'class': 'clearin'}):
                        a = item.find('div', {
                            'class': 'itemevent__head__name'
                        }).find('a')
                        film_name = a.text.strip().encode('utf-8')
                        film_slug = low(del_separator(film_name))
                        href = a.get('href')
                        film_id = href.replace('/cinema/movies/', '').replace('/', '').encode('utf-8')
                        full_url = '%s%s' % (source.url, href.lstrip('/'))
                        details = item.find('div', {
                            'class': 'itemevent__head__info'
                        }).text.encode('utf-8')
                        # production year is embedded like ".../2016/..."
                        year = re.findall(r'\/\d{4}\/', details)
                        if year:
                            year = int(year[0].replace('/', ''))
                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:
                            obj = films.get(film_id.decode('utf-8'))
                            # OFC76: strip U+2009 THIN SPACE from film names
                            film_slug = film_slug.decode("utf-8").replace(
                                u"\u2009", '').encode("utf-8")
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, year=year, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id.decode('utf-8')] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id, info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    # store or refresh the release date
                                    sr_obj, sr_created = SourceReleases.objects.get_or_create(
                                        film=objt,
                                        source_obj=source,
                                        defaults={
                                            'film': objt,
                                            'source_obj': source,
                                            'release': release_date,
                                        })
                                    if sr_created:
                                        # fetch the film's detail page once to
                                        # pick up poster and description
                                        # NOTE: reuses req/data — safe because
                                        # findAll() already materialized lists
                                        try:
                                            req = opener.open(
                                                urllib2.Request(full_url))
                                        except urllib2.HTTPError:
                                            req = None
                                        if req:
                                            data = BeautifulSoup(
                                                req.read(), "html.parser")
                                            movie_pic = data.find(
                                                'div', {
                                                    'class': 'movieabout__info__left'
                                                })
                                            pic = None
                                            if movie_pic:
                                                pic = movie_pic.find(
                                                    'a', {
                                                        'data-module': 'Gallery'
                                                    }).get('href')
                                            txt = None
                                            movie_txt = data.find(
                                                'div', {
                                                    'class': 'movieabout__info__descr__txt'
                                                })
                                            if movie_txt:
                                                txt = movie_txt.text.strip(
                                                ).encode('utf-8')
                                            if pic or txt:
                                                objt.text = txt
                                                objt.extra = pic
                                                objt.save()
                                        time.sleep(random.uniform(1.0, 1.5))
                                    else:
                                        if sr_obj.release != release_date:
                                            sr_obj.release = release_date
                                            sr_obj.save()
        # polite delay between month pages
        time.sleep(random.uniform(1.0, 2.0))
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Релизы')
def create_film_by_imdb_id(imdb):
    """Ensure a film identified by IMDb id exists in both databases.

    If no Films row exists for *imdb*, scrapes IMDb via parse_imdb() and
    creates one (marking it as generated); otherwise, if its vote count is
    missing, refreshes rate/votes from the title page.  Then mirrors the
    film into the kinoafisha DB via film_create_new_func() and links the two
    records through film_obj.kid.  Returns the kinoafisha film (or None).
    """
    distr_nof_data = ''
    data_nof_persons = ''
    nof_distr = []
    nof_persons = []
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    film_object = {}
    films = {}
    # preload lookup tables so parse_imdb() avoids per-item queries
    genres_data = {}            # english genre name -> Genre
    for i in Genre.objects.all():
        if i.name_en:
            genres_data[i.name_en] = i
    country_data = {}           # english country name -> Country
    for i in Country.objects.all():
        if i.name_en:
            country_data[i.name_en] = i
    productions = {}            # production company name -> ProductionsCo
    for i in ProductionsCo.objects.all():
        productions[i.name] = i
    persons_data = {}           # imdb person id (iid) -> Person
    for i in Person.objects.exclude(Q(iid=None) | Q(iid=0) | Q(kid=None)):
        persons_data[i.iid] = i
    distr_names = {}            # distributor pk -> canonical US name
    for i in NameDistributors.objects.filter(status=2, distributors__usa=True).values('distributors', 'name'):
        distr_names[int(i['distributors'])] = i['name'].encode('utf-8')
    distr_objs = {}             # distributor name -> Distributors
    for i in Distributors.objects.filter(usa=True):
        dname = distr_names.get(i.id, '')
        distr_objs[dname] = i
    images = list(Images.objects.all().values_list('file', flat=True))
    language = Language.objects.get(pk=2)
    count = 0
    release_format = '0'
    main_data = {}
    is_dump = False
    country_id = 1              # 1 == USA (see parse_imdb distributor filter)
    release = None
    check_imdb_rate = False
    try:
        film_obj = Films.objects.get(imdb_id=imdb)
        check_imdb_rate = True
    except Films.DoesNotExist:
        # create the record in the kinoinfo DB by scraping IMDb
        count, film_obj, distr_nof_data, data_nof_persons, nof_distr, nof_persons = parse_imdb(main_data, count, source, imdb, is_dump, images, country_data, genres_data, persons_data, productions, distr_objs, film_object, films, language, distr_nof_data, data_nof_persons, nof_distr, nof_persons, release_format, country_id, release)
        if film_obj:
            film_obj.generated = True
            film_obj.generated_dtime = datetime.datetime.now()
            film_obj.save()
    # refresh rating for pre-existing films that lack a vote count
    if check_imdb_rate:
        if film_obj and not film_obj.imdb_votes:
            opener = give_me_cookie()
            url = '%stitle/tt%s/' % (source.url, imdb)
            req = opener.open(urllib2.Request(url))
            if req.getcode() == 200:
                data = BeautifulSoup(req.read(), from_encoding="utf-8")
                imdb_rate = data.find('span', itemprop="ratingValue")
                imdb_votes = None
                if imdb_rate:
                    imdb_rate = float(imdb_rate.string)
                    imdb_votes = data.find('span', itemprop="ratingCount")
                    imdb_votes = int(imdb_votes.string.replace(u' ', '').replace(u',', ''))
                film_obj.imdb_votes = imdb_votes
                film_obj.imdb_rate = imdb_rate
                film_obj.save()
    # create/link the record in the kinoafisha DB
    # (local import — presumably avoids a circular module dependency)
    from film.views import film_create_new_func
    ka_film = None
    if film_obj:
        name = NameFilms.objects.get(status=1, language__id=2, films__pk=film_obj.id).name
        ka_film = film_create_new_func(name, film_obj.year, 1, create=False)
        ka_film.idalldvd = film_obj.imdb_id
        ka_film.runtime = film_obj.runtime
        ka_film.imdb = film_obj.imdb_rate if film_obj.imdb_rate else 0
        ka_film.imdb_votes = film_obj.imdb_votes if film_obj.imdb_votes else 0
        ka_film.save()
        # cross-link: kinoinfo film remembers its kinoafisha counterpart
        film_obj.kid = ka_film.id
        film_obj.save()
    return ka_film
def parse_imdb(main_data, count, source, imdb, is_dump, images, country_data, genres_data, persons_data, productions, distr_objs, film_object, films, language, distr_nof_data, data_nof_persons, nof_distr, nof_persons, release_format, country_id, release):
    """Scrape one IMDb title page and create/update the local Films record.

    Handles both the new (title_block) and old (infobar) IMDb page layouts.
    Also fetches /releaseinfo (US release date) and /companycredits
    (theatrical distributors).  Unknown persons and distributors are
    appended to the data_nof_persons / distr_nof_data XML accumulators.

    Returns the updated tuple
    (count, film_obj, distr_nof_data, data_nof_persons, nof_distr, nof_persons).
    """
    # MPAA rating -> local age limit
    limits = {
        'G': 0,
        'PG': 6,
        'PG-13': 12,
        'R': 16,
        'NC-17': 18,
    }
    imdb = get_imdb_id(imdb)
    opener = give_me_cookie()
    url = '%stitle/tt%s/' % (source.url, imdb)
    try:
        req = opener.open(urllib2.Request(url))
    except urllib2.HTTPError:
        req = None
    film_obj = None
    if req:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        imdb = long(imdb)
        fname = main_data.get('fname')
        fslug = main_data.get('fslug')
        fyear = main_data.get('fyear')
        details = main_data.get('details','')
        # presence of div.title_block distinguishes the new IMDb layout
        new_interface = data.find('div', {'class': "title_block"})
        if not is_dump:
            # title
            if new_interface:
                fname = data.find('h1', itemprop="name")
                try:
                    # drop the trailing "(year)" span from the <h1>
                    fname.find('span').extract()
                except AttributeError:
                    pass
                fname = fname.text.strip().encode('utf-8')
            else:
                h1 = data.find('h1', {'class': 'header'})
                fname = h1.find('span', itemprop="name").text.strip().encode('utf-8')
            fslug = low(del_separator(fname))
            # year
            if new_interface:
                year_tmp = data.find('title').text.replace(u' - IMDb','')
                # page title like 'The Expanse (2015)'
                year = re.findall(r'\(\d{4}\)$', year_tmp)
                if year:
                    fyear = year[0].replace('(','').replace(')','').strip()
                else:
                    # page title like 'The Expanse (TV Series 2015– )'
                    year = re.findall(r'\(.*\d{4}.*\)$', year_tmp)
                    if year:
                        year = re.findall(r'\d{4}', year[0].strip())
                        fyear = year[0] if year else fyear
            else:
                year = h1.find('span', {'class': 'nobr'})
                if year:
                    if year.find('a'):
                        year = year.find('a').text.encode('utf-8').strip()
                    else:
                        year = year.text.encode('utf-8').replace('(','').replace(')','').split('–')[0].strip()
                    try:
                        fyear = int(year)
                    except ValueError:
                        # e.g. "TV Series 2015" -> take the last token
                        fyear = int(year.split()[-1])
        # release date: look up the US theatrical date on /releaseinfo
        if not release:
            url_release = '%sreleaseinfo' % url
            time.sleep(1.5)
            req_release = opener.open(urllib2.Request(url_release))
            if req_release.getcode() == 200:
                data_release = BeautifulSoup(req_release.read(), from_encoding="utf-8")
                table = data_release.find('table', id='release_dates')
                if table:
                    for ttr in table.findAll('tr'):
                        tds = ttr.findAll('td')
                        td_country = tds[0].find('a').text.encode('utf-8').strip()
                        td_release = tds[1].text.encode('utf-8').strip()
                        td_details = tds[2].text.encode('utf-8').strip()
                        # '(' in details marks qualified releases (festival,
                        # limited, premiere) — skip those
                        if td_country == 'USA' and '(' not in td_details:
                            try:
                                td_day, td_month, td_year = td_release.split()
                                td_month = get_month_en(low(td_month.encode('utf-8')))
                                release = datetime.date(int(td_year), int(td_month), int(td_day))
                            except ValueError:
                                pass
        # poster
        if new_interface:
            poster = data.find('div', {'class': 'poster'})
        else:
            poster = data.find('td', id="img_primary")
            if poster:
                poster = poster.find('div', {'class': 'image'})
        if poster:
            if new_interface:
                poster = poster.find('img', itemprop="image").get('src').split('@._')[0]
            else:
                poster = poster.find('img').get('src').split('@._')[0]
            # request a fixed-size rendition
            poster += '@._V1_SX640_SY720_.jpg'
            poster_name = 'poster__%s' % md5_string_generate('%s%s' % (poster, datetime.datetime.now()))
            # regenerate until the name is unique among known image files
            while poster_name.decode('utf-8') in images:
                poster_name = 'poster__%s' % md5_string_generate('%s%s' % (poster, datetime.datetime.now()))
            images.append(poster_name.decode('utf-8'))
        else:
            poster = None
        # age limit / rating certificate
        if new_interface:
            title_block = data.find('div', {'class': "title_block"})
            limit = title_block.find('meta', itemprop="contentRating")
            if limit:
                limit = limit.get('content').encode('utf-8')
                limit = limits.get(limit)
            genres_tmp = [gen.text.encode('utf-8') for gen in title_block.findAll('span', itemprop="genre")]
            div_details = data.find('div', id="titleDetails")
            runtime = div_details.find('time', itemprop="duration")
        else:
            div = data.find('div', {'class': "infobar"})
            limit = div.find('span', itemprop="contentRating")
            if limit:
                limit = limit.get('content').encode('utf-8')
                limit = limits.get(limit)
            genres_tmp = [gen.string.encode('utf-8') for gen in div.findAll('span', itemprop="genre")]
            runtime = div.find('time', itemprop="duration")
        if runtime:
            runtime = runtime.text.strip().encode('utf-8')
            runtime = re.findall(r'\d+', runtime)[0]
        # rating
        imdb_rate = data.find('span', itemprop="ratingValue")
        imdb_votes = None
        if imdb_rate:
            imdb_rate = float(imdb_rate.text.encode('utf-8'))
            imdb_votes = data.find('span', itemprop="ratingCount")
            imdb_votes = int(imdb_votes.text.encode('utf-8').replace(u' ', '').replace(u',', ''))
        # genres — mapped with editorial rules:
        #   bare Crime -> "детектив"; Drama is dropped when combined with
        #   Action or Comedy; Romance+Drama -> "мелодрама"
        genres = []
        if len(genres_tmp) == 1 and genres_tmp[0] == 'Crime':
            # detective
            gen_obj = Genre.objects.get(name='детектив')
            genres.append(gen_obj)
        elif 'Action' in genres_tmp and 'Drama' in genres_tmp:
            # do not import Drama
            for genr in genres_tmp:
                if genr != 'Drama':
                    gen_obj = genres_data.get(genr)
                    genres.append(gen_obj)
        elif 'Romance' in genres_tmp:
            if 'Comedy' in genres_tmp:
                # do not import Drama
                for genr in genres_tmp:
                    if genr != 'Drama':
                        gen_obj = genres_data.get(genr)
                        genres.append(gen_obj)
            elif 'Drama' in genres_tmp:
                # melodrama replaces the Romance+Drama pair
                gen_obj = Genre.objects.get(name='мелодрама')
                genres.append(gen_obj)
                for genr in genres_tmp:
                    if genr != 'Drama' and genr != 'Romance':
                        gen_obj = genres_data.get(genr)
                        genres.append(gen_obj)
            else:
                for genr in genres_tmp:
                    gen_obj = genres_data.get(genr)
                    genres.append(gen_obj)
        else:
            for genr in genres_tmp:
                gen_obj = genres_data.get(genr)
                genres.append(gen_obj)
        # horror films are always at least 16+
        if 'Horror' in genres_tmp:
            if not limit or limit < 16:
                limit = 16
        note = None
        if new_interface:
            # credits: action 3 = director, 4 = screenplay writer, 1 = actor
            persons = []
            persons_block = data.find('div', {'class': "plot_summary_wrapper"})
            for pb in persons_block.findAll('span', itemprop="director"):
                pb_a = pb.find('a')
                pb_name = pb_a.text.encode('utf-8').strip()
                if pb_name:
                    pb_id = pb_a.get('href').split('?')[0]
                    pb_id = long(pb_id.replace('/name/nm', '').replace('/', ''))
                    persons.append({'name': pb_name, 'action': 3, 'status': 1, 'id': pb_id})
            for pb in persons_block.findAll('span', itemprop="creator"):
                pb_a = pb.find('a')
                pb_name = pb_a.text.encode('utf-8').strip()
                if pb_name:
                    # only screenplay writers, not e.g. "novel" credits
                    pb_type = pb_a.next_sibling
                    if u'screenplay' in pb_type:
                        pb_id = pb_a.get('href').split('?')[0]
                        pb_id = long(pb_id.replace('/name/nm', '').replace('/', ''))
                        persons.append({'name': pb_name, 'action': 4, 'status': 1, 'id': pb_id})
            for pb in persons_block.findAll('span', itemprop="actors"):
                pb_a = pb.find('a')
                pb_name = pb_a.text.encode('utf-8').strip()
                if pb_name:
                    pb_id = pb_a.get('href').split('?')[0]
                    pb_id = long(pb_id.replace('/name/nm', '').replace('/', ''))
                    persons.append({'name': pb_name, 'action': 1, 'status': 1, 'id': pb_id})
            budget_obj = None
            countries = []
            production = []
            for div in div_details.findAll('div', {'class': "txt-block"}):
                h4 = div.find('h4')
                if h4:
                    if h4.text == u'Country:':
                        for a in div.findAll('a'):
                            country_obj = country_data.get(a.text)
                            countries.append(country_obj)
                    elif h4.text == u'Budget:':
                        budget = div
                        budget.find('h4').extract()
                        budget.find('span').extract()
                        budget = budget.text.encode('utf-8').strip()
                        if '$' in budget or '€' in budget:
                            budget = budget.replace(' ', '').replace(',', '').replace('.', '')
                            budget_sum = re.findall(r'\d+\s?', budget)[0]
                            if '$' in budget:
                                budget_cur = '$'
                            elif '€' in budget:
                                budget_cur = '€'
                            if film_object and film_object['obj'].budget:
                                # update the existing budget in place
                                film_object['obj'].budget.budget = int(budget_sum)
                                film_object['obj'].budget.currency = budget_cur
                                film_object['obj'].budget.save()
                            else:
                                budget_obj = Budget.objects.create(
                                    budget = int(budget_sum),
                                    currency = budget_cur,
                                )
        else:
            # old layout: credits and details share the txt-block markup
            budget_obj = None
            countries = []
            production = []
            persons = []
            for div in data.findAll('div', {'class': "txt-block"}):
                h4 = div.find('h4')
                if h4:
                    if h4.string == u'Country:':
                        for a in div.findAll('a'):
                            country_obj = country_data.get(a.string)
                            countries.append(country_obj)
                    elif h4.string == u'Budget:':
                        budget = div
                        budget.find('h4').extract()
                        budget.find('span').extract()
                        budget = budget.text.encode('utf-8').strip()
                        if '$' in budget or '€' in budget:
                            budget = budget.replace(' ', '').replace(',', '').replace('.', '')
                            budget_sum = re.findall(r'\d+\s?', budget)[0]
                            if '$' in budget:
                                budget_cur = '$'
                            elif '€' in budget:
                                budget_cur = '€'
                            if film_object and film_object['obj'].budget:
                                film_object['obj'].budget.budget = int(budget_sum)
                                film_object['obj'].budget.currency = budget_cur
                                film_object['obj'].budget.save()
                            else:
                                budget_obj = Budget.objects.create(
                                    budget = int(budget_sum),
                                    currency = budget_cur,
                                )
                    elif h4.string == u'Director:':
                        for d in div.findAll('a'):
                            d_name = d.find('span', itemprop="name")
                            if d_name:
                                d_name = d_name.string
                                d_id = d.get('href').split('?')[0]
                                d_id = long(d_id.replace('/name/nm', '').replace('/', ''))
                                persons.append({'name': d_name, 'action': 3, 'status': 1, 'id': d_id})
                    elif h4.string == u'Writers:':
                        for w in div.findAll('a'):
                            p_name = w.find('span', itemprop="name")
                            if p_name:
                                p_name = p_name.string
                                p_type = w.next_sibling
                                w_id = w.get('href').split('?')[0]
                                w_id = long(w_id.replace('/name/nm', '').replace('/', ''))
                                if u'screenplay' in p_type:
                                    persons.append({'name': p_name, 'action': 4, 'status': 1, 'id': w_id})
                    elif h4.string == u'Stars:':
                        for s in div.findAll('a'):
                            s_name = s.find('span', itemprop="name")
                            if s_name:
                                s_name = s_name.string
                                s_id = s.get('href').split('?')[0]
                                s_id = long(s_id.replace('/name/nm', '').replace('/',''))
                                persons.append({'name': s_name, 'action': 1, 'status': 1, 'id': s_id})
        # distributors: theatrical entries for the target country
        distributors = []
        url2 = '%scompanycredits' % url
        time.sleep(1.5)
        req2 = opener.open(urllib2.Request(url2))
        if req2.getcode() == 200:
            data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
            distr_h4 = data2.find('h4', {'name': "distributors"})
            if distr_h4:
                ul = distr_h4.find_next("ul")
                for link in ul.findAll('a'):
                    distr_name = link.text.encode('utf-8')
                    if distr_name not in nof_distr:
                        distr_details = link.next_sibling.encode('utf-8').strip()
                        if country_id == 1:
                            cntry = 'USA'
                        else:
                            cntry = 'France'
                        if cntry in distr_details and 'theatrical' in distr_details:
                            distr_year = re.findall(r'\d{4}', distr_details)
                            distr_year = distr_year[0] if distr_year else None
                            distributors.append({'year': distr_year, 'name': distr_name})
        distr_data = []
        if distributors:
            # keep only distributors from the earliest release year
            distributors = sorted(distributors, key=operator.itemgetter('year'))
            cur_year = distributors[0]['year']
            for distrib in distributors:
                if distrib['year'] == cur_year:
                    distr_slug = low(del_separator(distrib['name']))
                    distr_obj = distr_objs.get(distr_slug)
                    if distr_obj:
                        distr_data.append(distr_obj)
                    else:
                        # NOTE(review): replace('&', '&') is a no-op; this
                        # looks like a mangled '&amp;' XML escape — verify.
                        distr_nof_data += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (distrib['name'].replace('&', '&'), distr_slug, None)
                        nof_distr.append(distrib['name'])
        poster_obj = None
        if poster:
            time.sleep(1.5)
            poster_obj = get_imdb_poster(poster, poster_name)
        # resolve scraped persons against preloaded Person records
        person_list = []
        for pe in persons:
            person_id = pe['id']
            person_obj = persons_data.get(person_id)
            if person_obj:
                person_list.append({'person': person_obj, 'st': pe['status'], 'act': pe['action']})
            else:
                if person_id not in nof_persons:
                    try:
                        person_name = pe['name'].decode('utf8').encode('utf-8')
                    except UnicodeEncodeError:
                        person_name = pe['name'].encode('utf-8')
                    person_slug = low(del_separator(person_name))
                    data_nof_persons += '<person name="%s" slug="%s" code="%s" name_alt="" slug_alt=""></person>' % (person_name, person_slug, person_id)
                    nof_persons.append(pe['id'])
        # create a new Films record or update the one passed in film_object
        new = False
        if film_object:
            if not film_object['obj'].imdb_id:
                film_object['obj'].imdb_id = imdb
            if not film_object['obj'].budget and budget_obj:
                film_object['obj'].budget = budget_obj
            if film_object['obj'].runtime != runtime:
                film_object['obj'].runtime = runtime
            if film_object['obj'].imdb_votes != imdb_votes:
                film_object['obj'].imdb_votes = imdb_votes
                film_object['obj'].imdb_rate = imdb_rate
            if film_object['obj'].year != fyear:
                film_object['obj'].year = fyear
            film_object['obj'].save()
        else:
            film_obj = Films.objects.create(
                year = fyear,
                note = note,
                runtime = runtime,
                rated = limit,
                budget = budget_obj,
                imdb_id = imdb,
                imdb_rate = imdb_rate,
                imdb_votes = imdb_votes,
            )
            film_object = {'releases': [], 'obj': film_obj}
            new = True
            if is_dump:
                films[int(imdb)] = {'obj': film_obj, 'releases': []}
        if release and release not in film_object['releases']:
            rel_obj = FilmsReleaseDate.objects.create(
                release = release,
                note = details,
                format = release_format,
                country_id = country_id,
            )
            film_object['obj'].release.add(rel_obj)
            if is_dump:
                films[int(imdb)]['releases'].append(rel_obj.release)
        if not new:
            # drop previously auto-imported (status=0) images and their files
            for img in film_object['obj'].images.filter(status=0):
                img_p = '%s%s' % (settings.MEDIA_ROOT, img.file)
                try:
                    os.remove(img_p)
                except OSError:
                    pass
                film_object['obj'].images.remove(img)
                img.delete()
        if poster_obj:
            film_object['obj'].images.add(poster_obj)
        # names: status 1 = display name, status 2 = slug
        film_names = [
            {'name': fname, 'status': 1},
            {'name': fslug, 'status': 2},
        ]
        for f in film_names:
            name_obj, name_created = NameFilms.objects.get_or_create(
                name = f['name'].strip(),
                status = f['status'],
                language = language,
                defaults = {
                    'name': f['name'].strip(),
                    'status': f['status'],
                    'language': language,
                })
            # replace any existing name of the same status/language
            for fn in film_object['obj'].name.all():
                if fn.status == f['status'] and fn.language == language:
                    film_object['obj'].name.remove(fn)
            film_object['obj'].name.add(name_obj)
        for c in countries:
            if c:
                if new:
                    film_object['obj'].country.add(c)
                else:
                    if c not in film_object['obj'].country.all():
                        film_object['obj'].country.add(c)
        for g in genres:
            if g:
                if new:
                    film_object['obj'].genre.add(g)
                else:
                    if g not in film_object['obj'].genre.all():
                        film_object['obj'].genre.add(g)
        for pr in production:
            if pr:
                if new:
                    film_object['obj'].production.add(pr)
                else:
                    if pr not in film_object['obj'].production.all():
                        film_object['obj'].production.add(pr)
        for pers in person_list:
            rel_fp, rel_fp_created = RelationFP.objects.get_or_create(
                person = pers['person'],
                status_act_id = pers['st'],
                action_id = pers['act'],
                films = film_object['obj'],
                defaults = {
                    'person': pers['person'],
                    'status_act_id': pers['st'],
                    'action_id': pers['act'],
                    'films': film_object['obj'],
                })
        for dis_data in distr_data:
            if new:
                film_object['obj'].distributor.add(dis_data)
            else:
                if dis_data not in film_object['obj'].distributor.all():
                    film_object['obj'].distributor.add(dis_data)
        film_obj = film_object['obj']
        count += 1
    return count, film_obj, distr_nof_data, data_nof_persons, nof_distr, nof_persons
def imdb_searching(query, exact=False):
    """Search IMDb titles for *query* and return up to five matches.

    Each match is a dict with 'title', 'persons' (director names), 'link',
    'year', 'aka' and 'id'.  With exact=True only titles whose year is
    within two years of the current year are kept.
    """
    query_params = {
        'q': query,
        's': 'tt',
    }
    if exact:
        query_params['exact'] = 'true'
    url = 'http://www.imdb.com/find?%s' % urllib.urlencode(query_params)
    resp, content = httplib2.Http(disable_ssl_certificate_validation=True).request(url)
    matches = []
    found = 0
    this_year = datetime.datetime.now().year
    from_year = this_year - 2
    to_year = this_year + 2
    if resp['status'] == '200':
        page = BeautifulSoup(content, "html5lib", from_encoding="utf-8")
        listing = page.find('table', {'class': 'findList'})
        if listing:
            for row in listing.findAll('tr'):
                cell = row.find('td', {'class': "result_text"})
                # year appears in the result text as "(2015)"
                year = re.findall('\(\d+\)', cell.text.encode('utf-8'))
                if year:
                    year = int(year[0].replace('(', '').replace(')', '').strip())
                accept = True
                if exact:
                    accept = from_year <= year <= to_year
                if not (accept and year):
                    continue
                found += 1
                anchor = cell.find('a')
                aka = cell.find('i')
                aka = aka.string.encode('utf-8') if aka else ''
                link = anchor.get('href').split('?')[0]
                title = anchor.text.encode('utf-8')
                # fetch the title page to collect director names
                opener = give_me_cookie()
                detail_req = opener.open(urllib2.Request('http://www.imdb.com%s' % link))
                imdb_id = int(link.replace('/title/tt', '').replace('/', ''))
                persons = []
                if detail_req.getcode() == 200:
                    detail = BeautifulSoup(detail_req.read(), "html5lib", from_encoding="utf-8")
                    for div in detail.findAll('div', {'class': "txt-block"}):
                        h4 = div.find('h4')
                        if h4 and h4.string == u'Director:':
                            for d in div.findAll('a'):
                                d_name = d.find('span', itemprop="name")
                                if d_name:
                                    persons.append(d_name.string)
                matches.append({'title': title, 'persons': persons, 'link': link, 'year': year, 'aka': aka, 'id': imdb_id})
                if found == 5:
                    break
    return matches