# Example #1
def cinemate_cc_login():
    """Log in to cinemate.cc and return the session state.

    Returns a dict with keys:
        source -- the ImportSources row for cinemate.cc
        opener -- a cookie-enabled urllib2 opener (authenticated on success)
        error  -- body of the HTTPError response on failure, else None

    Fix over the original: the function previously fell through and
    returned None on success, so callers checking result['error']
    (see the disabled flow in cinemate_cc_soon) would crash.
    """
    source = ImportSources.objects.get(url='http://cinemate.cc/')

    opener = give_me_cookie()

    url = '%slogin/' % source.url

    req = opener.open(urllib2.Request(url))

    page = BeautifulSoup(req.read(), from_encoding="utf-8")

    login_form = page.find('form', id="login_form")

    if login_form:
        # The CSRF token must be echoed back or Django rejects the POST.
        csrf = login_form.find('input',
                               {'name': 'csrfmiddlewaretoken'})['value']

        # SECURITY(review): credentials are hard-coded in source control;
        # move them to settings or environment configuration.
        login = '******'
        passwd = 'P0mk67H2kq'

        values = urllib.urlencode({
            'csrfmiddlewaretoken': csrf,
            'username': login,
            'password': passwd,
        })

        # submit the login form; redirect to the profile page on success
        url += '?next=/profile/%s/' % login

        try:
            req = opener.open(urllib2.Request(url, values))
        except urllib2.HTTPError as error:
            return {'source': source, 'opener': opener, 'error': error.read()}

    # Success (or no login form present): return the same dict shape with
    # error=None so callers can uniformly inspect result['error'].
    return {'source': source, 'opener': opener, 'error': None}
# Example #2
def get_imdb_film_list():
    """Scrape the IMDb US release calendar into an XML dump file.

    Walks http://www.imdb.com/calendar/?region=us, grouping upcoming
    releases by date while skipping documentaries and limited / festival /
    TV-premiere releases, then writes the collected
    ``<data><ids ...>...</ids></data>`` payload with create_dump_file.

    Fixes over the original:
      * removed a stray ``time`` token appended to the final ``%``
        formatting expression (it was a SyntaxError);
      * the details filter used ``A and B or C`` which, by operator
        precedence, was true whenever details lacked 'tv premiere';
        it now requires all three exclusions;
      * '&' is escaped before '"' so '"' no longer becomes '&amp;quot;'.
    """

    source = ImportSources.objects.get(url='http://www.imdb.com/')

    url = '%scalendar/?region=us' % source.url

    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))

    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        old_date = ''
        for h4 in div.findAll('h4'):
            # h4 text is a date heading like "26 June 2015"
            release = h4.string.encode('utf-8')
            day, month, year = release.split()

            month = get_month_en(low(month))

            rel_date = '%s-%s-%s' % (year, month, day)

            xml += '<date v="%s">' % rel_date

            ul = h4.find_next('ul')

            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>','').replace('</i>','')
                            details = details.replace('(','').replace(')','')
                        else:
                            details = ''

                        # skip limited, festival and TV-premiere releases
                        if ('limited' not in low(details)
                                and 'fest' not in low(details)
                                and 'tv premiere' not in low(details)):
                            # escape '&' first, then '"', for valid XML
                            film_name = li.a.string.encode('utf-8').replace('&','&amp;').replace('"', '&quot;')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')

                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)

            xml += '</date>'
    # NOTE(review): set() makes the id order non-deterministic between runs.
    ids = ';'.join(set(ids))
    xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)

    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')
# Example #3
def cinemate_cc_get_links():
    """Collect torrent/tracker links for up to 50 cinemate.cc films.

    For each SourceFilms row of the cinemate.cc source, scrapes the film's
    "links" page and, for every download row whose go-link id is not yet
    stored, resolves the external tracker URL and creates a
    CinemateTorrents record.  One HTTP request per film plus one per new
    link; sleeps between films to throttle the crawl.
    """
    source = ImportSources.objects.get(url='http://cinemate.cc/')

    # source_id -> SourceFilms object; capped at 50 films per run
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)[:50]
    for i in source_films:
        films[int(i.source_id)] = i

    # go_link ids already stored, so known links are not fetched again
    torrents = list(
        CinemateTorrents.objects.filter(
            film__source_id__in=films.keys()).values_list('go_link_id',
                                                          flat=True))

    opener = give_me_cookie()

    for source_id, film in films.iteritems():

        url = '%smovie/%s/links/#tabs' % (source.url, source_id)
        req = opener.open(urllib2.Request(url))
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        table = data.find('div', {'class': "table"})
        for div in table.findAll('div', {'class': "row delimiter"}):
            # column cells: [2]=tracker name, [3]=quality, [-1]=file size
            td_div = div.findAll('div')
            tracker = td_div[2].text.strip().encode('utf-8')
            quality = td_div[3].text.strip().encode('utf-8')
            size = td_div[-1].text.strip().encode('utf-8')
            link_id = div.find('a', {
                'class': "icon_t download-link"
            }).get('href', '').replace('/go/s/', '').replace('/', '')

            if link_id not in torrents:

                # follow the /go/s/<id> redirect page; the external
                # tracker URL is the first rel="nofollow" anchor there
                go_url = '%sgo/s/%s' % (source.url, link_id)
                go_req = opener.open(urllib2.Request(go_url))
                go_data = BeautifulSoup(go_req.read(), from_encoding="utf-8")

                main = go_data.find('div', {'class': "main"})

                a = main.find('a', rel="nofollow").get('href')

                CinemateTorrents.objects.create(
                    film=film,
                    go_link_id=link_id,
                    link=a,
                    tracker=tracker,
                    quality=quality,
                    file_size=size,
                )

        # polite crawl delay between films
        time.sleep(random.uniform(0.8, 1.2))

    cron_success('html', source.dump, 'links', 'Ссылки на трекеры')
# Example #4
def get_imdb_rate(imdb):
    """Fetch the IMDb rating and vote count for a title.

    Parameters:
        imdb -- raw IMDb identifier; normalised through get_imdb_id().

    Returns:
        (imdb_rate, imdb_votes) -- float rating and int vote count;
        either may be None when the page is unreachable or the value
        is absent from the markup.

    Fix over the original: when the ratingValue span exists but the
    ratingCount span is missing, the original crashed with
    AttributeError on ``None.text``; the count is now guarded.
    """
    imdb_votes = None
    imdb_rate = None
    imdb = get_imdb_id(imdb)
    opener = give_me_cookie()
    url = 'http://www.imdb.com/title/tt%s/' % imdb
    try:
        req = opener.open(urllib2.Request(url))
    except urllib2.HTTPError:
        req = None
        print "http  error"
    if req:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        # rating
        imdb_rate = data.find('span', itemprop="ratingValue")

        if imdb_rate:
            imdb_rate = float(imdb_rate.text.encode('utf-8'))
            imdb_votes = data.find('span', itemprop="ratingCount")
            # ratingCount may be absent even when ratingValue is present
            if imdb_votes:
                imdb_votes = int(
                    imdb_votes.text.encode('utf-8').replace(u' ',
                                                            '').replace(u',', ''))

    return imdb_rate, imdb_votes
# Example #5
def cinemate_cc_soon():
    """Import upcoming and in-cinema film listings from cinemate.cc.

    Crawls the "soon" and "cinema" listings (up to 10 pages each),
    matches each scraped film against the local catalogue via
    film_identification, creates SourceFilms rows for new matches and
    records unmatched films in the NOF dump.  When anything new appears,
    posts an internal notification message to the torrents editor.

    NOTE(review): the authenticated variant below is disabled; kept
    verbatim for reference:

        login = cinemate_cc_login()
        if login['error']:
            return HttpResponse(str(login['error']))
        else:
            opener = login['opener']
            source = login['source']
    """

    source = ImportSources.objects.get(url='http://cinemate.cc/')

    opener = give_me_cookie()

    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    sfilm_clean(source)

    # source_id (int) -> SourceFilms object for existing films
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[int(i.source_id)] = i
    fdict = get_all_source_films(source, source_films)

    send_msg = False

    for main_url in ('%smovies/soon' % source.url,
                     '%smovies/cinema' % source.url):

        req = opener.open(urllib2.Request(main_url))

        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        # last pagination link carries the final page number
        nav = data.find('div', {'class': "navigation"})
        nav_link = nav.findAll('a')[-1]
        last_page = int(nav_link.get('href').split('?page=')[-1])

        # crawl at most 10 pages per listing
        if last_page > 10:
            last_page = 10

        film_list = get_cinemate_cc_film(data, source, ignored, noffilms)

        for page in xrange(2, (last_page + 1)):
            time.sleep(random.uniform(1.0, 2.5))
            url = '%s?page=%s' % (main_url, page)
            try:
                req = opener.open(urllib2.Request(url))
                data = BeautifulSoup(req.read(), from_encoding="utf-8")
                film_list += get_cinemate_cc_film(data, source, ignored,
                                                  noffilms)
            except urllib2.HTTPError:
                # best-effort: a failed page is simply skipped
                pass

        for i in film_list:

            obj = films.get(i['id'])
            next_step = checking_obj(obj)

            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    # try to identify the film in the local catalogue
                    kid, info = film_identification(i['slug'],
                                                    None, {}, {},
                                                    year=i['year'],
                                                    source=source)

                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        objt = create_sfilm(i['id'],
                                            kid,
                                            source,
                                            i['name'],
                                            year=i['year'],
                                            txt=datetime.datetime.now().date(),
                                            extra='new')
                        films[i['id']] = objt
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(objt)
                        send_msg = True
                elif not obj:
                    # unidentified film: append to the not-found dump
                    data_nof_film += xml_noffilm(i['name'], i['slug'], None,
                                                 None, i['id'], info,
                                                 i['url'].encode('utf-8'),
                                                 source.id)
                    noffilms.append(i['id'])

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы в сети')

    if send_msg:
        # notify the torrents editor that new films appeared
        current_site = DjangoSite.objects.get(domain='kinoinfo.ru')

        msg_from = Profile.objects.get(user__last_name='SYSTEM')
        msg_to = Profile.objects.get(
            accounts__login='******')  # [email protected]
        msg = 'В сети появились новые фильмы <a href="http://kinoinfo.ru/torrents/listing/%s/" target="_blank">http://kinoinfo.ru/torrents/listing/%s/</a>' % (
            source.id, source.id)

        # reuse the latest existing dialog between the two users, if any
        try:
            dialog_exist = DialogMessages.objects.filter(
                readers__user=msg_to,
                readers__message__autor=msg_from).order_by('-id')[0]
        except IndexError:
            dialog_exist = None

        reader_type = '1'
        msg_obj = News.objects.create(
            title='Сообщение',
            text=msg,
            autor=msg_from,
            site=current_site,
            subdomain='0',
            reader_type='1',
        )

        reader = NewsReaders.objects.create(user=msg_to,
                                            status='0',
                                            message=msg_obj)

        if dialog_exist:
            dialog_exist.readers.add(reader)
        else:
            dialog_obj = DialogMessages()
            dialog_obj.save()
            dialog_obj.readers.add(reader)
# Example #6
def get_mailru_soon():
    """Import upcoming film releases from afisha.mail.ru.

    Iterates over the next 13 months of the /cinema/soon/<year>/<month>/
    calendar, identifies each film against the local catalogue, creates
    SourceFilms and SourceReleases rows, fetches poster/description for
    newly created releases, and dumps unidentified films into the NOF
    file.  Heavy network + DB side effects; throttled with sleeps.
    """
    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    source = ImportSources.objects.get(url='https://afisha.mail.ru/')
    sfilm_clean(source)

    # source_id (unicode) -> SourceFilms object for existing films
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.today()

    # today plus the first day of each of the next 12 months
    dates = list(
        map((lambda x: today.date() + relativedelta(months=x)), xrange(1, 13)))
    dates.insert(0, today.date())

    for d in dates:

        main_url = '%scinema/soon/%s/%s/' % (source.url, d.year, d.month)

        opener = give_me_cookie()
        #headers = {
        #    'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.2.2; en-us; Nexus 7 Build/JDQ39E) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30 CyanogenMod/10.1.3/grouper',
        #}
        #opener.addheaders = headers.items()

        try:
            req = opener.open(urllib2.Request(main_url))
        except urllib2.HTTPError:
            # best-effort: a failed month page is skipped entirely
            req = None

        if req:
            data = BeautifulSoup(req.read(), "html.parser")
            for block in data.findAll('div', {'class': 'premiere__date'}):
                # day-of-month of this premiere group
                day = block.find('div', {'class': 'premiere__date__mday'}).text
                if day:
                    release_date = datetime.date(d.year, d.month, int(day))

                    for item in block.findAll('div', {'class': 'clearin'}):
                        a = item.find('div', {
                            'class': 'itemevent__head__name'
                        }).find('a')
                        film_name = a.text.strip().encode('utf-8')
                        film_slug = low(del_separator(film_name))
                        href = a.get('href')
                        film_id = href.replace('/cinema/movies/',
                                               '').replace('/',
                                                           '').encode('utf-8')
                        full_url = '%s%s' % (source.url, href.lstrip('/'))
                        details = item.find('div', {
                            'class': 'itemevent__head__info'
                        }).text.encode('utf-8')
                        # year appears in details as "/2016/"
                        # NOTE(review): if no match, year stays an empty
                        # list and is passed on as-is — confirm downstream
                        year = re.findall(r'\/\d{4}\/', details)
                        if year:
                            year = int(year[0].replace('/', ''))

                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:

                            obj = films.get(film_id.decode('utf-8'))
                            #OFC76 path from U+2009|e2 80 89|THIN SPACE
                            #in film name
                            film_slug = film_slug.decode("utf-8").replace(
                                u"\u2009", '').encode("utf-8")
                            next_step = checking_obj(obj)

                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    # identify against local catalogue
                                    kid, info = film_identification(
                                        film_slug,
                                        None, {}, {},
                                        year=year,
                                        source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id.decode('utf-8')] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    # unidentified: add to not-found dump
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None,
                                        None, film_id, info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)

                                if objt:
                                    sr_obj, sr_created = SourceReleases.objects.get_or_create(
                                        film=objt,
                                        source_obj=source,
                                        defaults={
                                            'film': objt,
                                            'source_obj': source,
                                            'release': release_date,
                                        })
                                    if sr_created:

                                        # fetch the film page for poster
                                        # and description only once, when
                                        # the release is first created
                                        try:
                                            req = opener.open(
                                                urllib2.Request(full_url))
                                        except urllib2.HTTPError:
                                            req = None

                                        if req:
                                            data = BeautifulSoup(
                                                req.read(), "html.parser")
                                            movie_pic = data.find(
                                                'div', {
                                                    'class':
                                                    'movieabout__info__left'
                                                })
                                            pic = None
                                            if movie_pic:
                                                pic = movie_pic.find(
                                                    'a', {
                                                        'data-module':
                                                        'Gallery'
                                                    }).get('href')

                                            txt = None
                                            movie_txt = data.find(
                                                'div', {
                                                    'class':
                                                    'movieabout__info__descr__txt'
                                                })
                                            if movie_txt:
                                                txt = movie_txt.text.strip(
                                                ).encode('utf-8')

                                            if pic or txt:
                                                objt.text = txt
                                                objt.extra = pic
                                                objt.save()

                                        time.sleep(random.uniform(1.0, 1.5))
                                    else:
                                        # existing release: refresh the
                                        # date if it changed
                                        if sr_obj.release != release_date:
                                            sr_obj.release = release_date
                                            sr_obj.save()

        # polite crawl delay between month pages
        time.sleep(random.uniform(1.0, 2.0))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Релизы')
# Example #7
def create_film_by_imdb_id(imdb):
    """Create (or complete) a film record from an IMDb id.

    If no Films row exists for the id, scrapes IMDb via parse_imdb and
    creates one (marked generated).  If one exists but lacks IMDb votes,
    refreshes rating/votes from the title page.  Finally mirrors the film
    into the kinoafisha catalogue via film_create_new_func.

    Returns the kinoafisha film object, or None if creation failed.
    """

    distr_nof_data = ''
    data_nof_persons = ''
    nof_distr = []
    nof_persons = []

    source = ImportSources.objects.get(url='http://www.imdb.com/')
    film_object = {}
    films = {}

    # lookup caches keyed by English names / external ids, built once
    # up front so parse_imdb does no per-item queries
    genres_data = {}
    for i in Genre.objects.all():
        if i.name_en:
            genres_data[i.name_en] = i

    country_data = {}
    for i in Country.objects.all():
        if i.name_en:
            country_data[i.name_en] = i

    productions = {}
    for i in ProductionsCo.objects.all():
        productions[i.name] = i

    persons_data = {}
    for i in Person.objects.exclude(Q(iid=None) | Q(iid=0) | Q(kid=None)):
        persons_data[i.iid] = i

    distr_names = {}
    for i in NameDistributors.objects.filter(status=2, distributors__usa=True).values('distributors', 'name'):
        distr_names[int(i['distributors'])] = i['name'].encode('utf-8')

    distr_objs = {}
    for i in Distributors.objects.filter(usa=True):
        dname = distr_names.get(i.id, '')
        distr_objs[dname] = i

    images = list(Images.objects.all().values_list('file', flat=True))

    language = Language.objects.get(pk=2)

    count = 0
    release_format = '0'
    main_data = {}
    is_dump = False
    country_id = 1
    release = None
    check_imdb_rate = False
    
    try:
        film_obj = Films.objects.get(imdb_id=imdb)
        check_imdb_rate = True
    except Films.DoesNotExist:
        # create the film in the kinoinfo DB (translated from Russian)
        count, film_obj, distr_nof_data, data_nof_persons, nof_distr, nof_persons = parse_imdb(main_data, count, source, imdb, is_dump, images, country_data, genres_data, persons_data, productions, distr_objs, film_object, films, language, distr_nof_data, data_nof_persons, nof_distr, nof_persons, release_format, country_id, release)
        if film_obj:
            film_obj.generated = True
            film_obj.generated_dtime = datetime.datetime.now()
            film_obj.save()
    
    # create in the kinoafisha DB (translated from Russian)
    if check_imdb_rate:
        if film_obj and not film_obj.imdb_votes:
            # existing film without votes: refresh rating from IMDb
            opener = give_me_cookie()
            url = '%stitle/tt%s/' % (source.url, imdb)
            req = opener.open(urllib2.Request(url))

            if req.getcode() == 200:
                data = BeautifulSoup(req.read(), from_encoding="utf-8")

                imdb_rate = data.find('span', itemprop="ratingValue")
                imdb_votes = None
                if imdb_rate:
                    imdb_rate = float(imdb_rate.string)
                    imdb_votes = data.find('span', itemprop="ratingCount")
                    imdb_votes = int(imdb_votes.string.replace(u' ', '').replace(u',', ''))
    
                film_obj.imdb_votes = imdb_votes
                film_obj.imdb_rate = imdb_rate
                film_obj.save()
    
    from film.views import film_create_new_func
    ka_film = None
    if film_obj:
        # mirror into kinoafisha using the primary Russian-language name
        name = NameFilms.objects.get(status=1, language__id=2, films__pk=film_obj.id).name
        
        ka_film = film_create_new_func(name, film_obj.year, 1, create=False)
        
        ka_film.idalldvd = film_obj.imdb_id
        ka_film.runtime = film_obj.runtime
        ka_film.imdb = film_obj.imdb_rate if film_obj.imdb_rate else 0
        ka_film.imdb_votes = film_obj.imdb_votes if film_obj.imdb_votes else 0
        ka_film.save()
        
        # link the two records by kinoafisha id
        film_obj.kid = ka_film.id
        film_obj.save()

    return ka_film
# Example #8
def parse_imdb(main_data, count, source, imdb, is_dump, images, country_data, genres_data, persons_data, productions, distr_objs, film_object, films, language, distr_nof_data, data_nof_persons, nof_distr, nof_persons, release_format, country_id, release):

    limits = {
        'G': 0,
        'PG': 6,
        'PG-13': 12,
        'R': 16,
        'NC-17': 18,
    }
    
    imdb = get_imdb_id(imdb)
    opener = give_me_cookie()
    url = '%stitle/tt%s/' % (source.url, imdb)
    try:
        req = opener.open(urllib2.Request(url))
    except urllib2.HTTPError:
        req = None

    film_obj = None
    if req:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        #open('imdb.html','w').write(str(data))

        imdb = long(imdb)
        
        fname = main_data.get('fname')
        fslug = main_data.get('fslug')
        fyear = main_data.get('fyear')
        details = main_data.get('details','')
        
        new_interface = data.find('div', {'class': "title_block"})

        if not is_dump:
            # название
            if new_interface:
                fname = data.find('h1', itemprop="name")
                try:
                    fname.find('span').extract()
                except AttributeError: pass
                fname = fname.text.strip().encode('utf-8')
            else:
                h1 = data.find('h1', {'class': 'header'})
                fname = h1.find('span', itemprop="name").text.strip().encode('utf-8')

            fslug = low(del_separator(fname))
            
            # год
            if new_interface:
                year_tmp = data.find('title').text.replace(u' - IMDb','')
                # если такого вида 'The Expanse (2015)'
                year = re.findall(r'\(\d{4}\)$', year_tmp)
                if year:
                    fyear = year[0].replace('(','').replace(')','').strip()
                else:
                    # если такого вида 'The Expanse (TV Series 2015– )'
                    year = re.findall(r'\(.*\d{4}.*\)$', year_tmp)
                    if year:
                        year = re.findall(r'\d{4}', year[0].strip())
                        fyear = year[0] if year else fyear
            else:
                year = h1.find('span', {'class': 'nobr'})
                if year:
                    if year.find('a'):
                        year = year.find('a').text.encode('utf-8').strip()
                    else:
                        year = year.text.encode('utf-8').replace('(','').replace(')','').split('–')[0].strip()

                    try:
                        fyear = int(year)
                    except ValueError:
                        fyear = int(year.split()[-1])
            
            # дата релиза
            if not release:
                url_release = '%sreleaseinfo' % url
                time.sleep(1.5)
                req_release = opener.open(urllib2.Request(url_release))
                if req_release.getcode() == 200:
                    data_release = BeautifulSoup(req_release.read(), from_encoding="utf-8")

                    table = data_release.find('table', id='release_dates')
                    if table:
                        for ttr in table.findAll('tr'):
                            tds = ttr.findAll('td')
                            td_country = tds[0].find('a').text.encode('utf-8').strip()
                            td_release = tds[1].text.encode('utf-8').strip()
                            td_details = tds[2].text.encode('utf-8').strip()
                            if td_country == 'USA' and '(' not in td_details:
                                try:
                                    td_day, td_month, td_year = td_release.split()
                                    td_month = get_month_en(low(td_month.encode('utf-8')))
                                    release = datetime.date(int(td_year), int(td_month), int(td_day))
                                except ValueError: pass

        
        # постер
        if new_interface:
            poster = data.find('div', {'class': 'poster'})
        else:
            poster = data.find('td', id="img_primary")
            if poster:
                poster = poster.find('div', {'class': 'image'})

        if poster:
            if new_interface:
                poster = poster.find('img', itemprop="image").get('src').split('@._')[0]
            else:
                poster = poster.find('img').get('src').split('@._')[0]

            poster += '@._V1_SX640_SY720_.jpg'
            
            poster_name = 'poster__%s' % md5_string_generate('%s%s' % (poster, datetime.datetime.now()))
            
            while poster_name.decode('utf-8') in images:
                poster_name = 'poster__%s' % md5_string_generate('%s%s' % (poster, datetime.datetime.now()))
            
            images.append(poster_name.decode('utf-8'))
        else:
            poster = None
        
        # ограничения
        if new_interface:
            title_block = data.find('div', {'class': "title_block"})

            limit = title_block.find('meta', itemprop="contentRating")
            if limit:
                limit = limit.get('content').encode('utf-8')
                limit = limits.get(limit)

            genres_tmp = [gen.text.encode('utf-8') for gen in title_block.findAll('span', itemprop="genre")]

            div_details = data.find('div', id="titleDetails")

            runtime = div_details.find('time', itemprop="duration")
        else:
            div = data.find('div', {'class': "infobar"})

            limit = div.find('span', itemprop="contentRating")
            if limit:
                limit = limit.get('content').encode('utf-8')
                limit = limits.get(limit)

            genres_tmp = [gen.string.encode('utf-8') for gen in div.findAll('span', itemprop="genre")]
        
            runtime = div.find('time', itemprop="duration")


        if runtime:
            runtime = runtime.text.strip().encode('utf-8')
            runtime = re.findall(r'\d+', runtime)[0]

        # рейтинг
        imdb_rate = data.find('span', itemprop="ratingValue")
        imdb_votes = None
        if imdb_rate:
            imdb_rate = float(imdb_rate.text.encode('utf-8'))
            imdb_votes = data.find('span', itemprop="ratingCount")
            imdb_votes = int(imdb_votes.text.encode('utf-8').replace(u' ', '').replace(u',', ''))
        

        # жанры
        genres = []
        if len(genres_tmp) == 1 and genres_tmp[0] == 'Crime':
            # детектив
            gen_obj = Genre.objects.get(name='детектив')
            genres.append(gen_obj)
        elif 'Action' in genres_tmp and 'Drama' in genres_tmp:
            # драму не импортируем
            for genr in genres_tmp:
                if genr != 'Drama':
                    gen_obj = genres_data.get(genr)
                    genres.append(gen_obj)
        elif 'Romance' in genres_tmp:
            if 'Comedy' in genres_tmp:
                # драму не импортируем
                for genr in genres_tmp:
                    if genr != 'Drama':
                        gen_obj = genres_data.get(genr)
                        genres.append(gen_obj)
            elif 'Drama' in genres_tmp:
                # мелодрама
                gen_obj = Genre.objects.get(name='мелодрама')
                genres.append(gen_obj)
                for genr in genres_tmp:
                    if genr != 'Drama' and genr != 'Romance':
                        gen_obj = genres_data.get(genr)
                        genres.append(gen_obj)
            else:
                for genr in genres_tmp:
                    gen_obj = genres_data.get(genr)
                    genres.append(gen_obj)
        
        else:
            for genr in genres_tmp:
                gen_obj = genres_data.get(genr)
                genres.append(gen_obj)
            
        if 'Horror' in genres_tmp:
            if not limit or limit < 16:
                limit = 16

        note = None

        if new_interface:
            persons = []
            persons_block = data.find('div', {'class': "plot_summary_wrapper"})
            for pb in persons_block.findAll('span', itemprop="director"):
                pb_a = pb.find('a')
                pb_name = pb_a.text.encode('utf-8').strip()
                if pb_name:
                    pb_id = pb_a.get('href').split('?')[0]
                    pb_id = long(pb_id.replace('/name/nm', '').replace('/', ''))
                    persons.append({'name': pb_name, 'action': 3, 'status': 1, 'id': pb_id})
            
            for pb in persons_block.findAll('span', itemprop="creator"):
                pb_a = pb.find('a')
                pb_name = pb_a.text.encode('utf-8').strip()
                if pb_name:
                    pb_type = pb_a.next_sibling
                    if u'screenplay' in pb_type:
                        pb_id = pb_a.get('href').split('?')[0]
                        pb_id = long(pb_id.replace('/name/nm', '').replace('/', ''))
                        persons.append({'name': pb_name, 'action': 4, 'status': 1, 'id': pb_id})

            for pb in persons_block.findAll('span', itemprop="actors"):
                pb_a = pb.find('a')
                pb_name = pb_a.text.encode('utf-8').strip()
                if pb_name:
                    pb_id = pb_a.get('href').split('?')[0]
                    pb_id = long(pb_id.replace('/name/nm', '').replace('/', ''))
                    persons.append({'name': pb_name, 'action': 1, 'status': 1, 'id': pb_id})

            budget_obj = None
            countries = []
            production = []
            for div in div_details.findAll('div', {'class': "txt-block"}):
                h4 = div.find('h4')
                if h4:
                    if h4.text == u'Country:':
                        for a in div.findAll('a'):
                            country_obj = country_data.get(a.text)
                            countries.append(country_obj)
                    elif h4.text == u'Budget:':
                        budget = div
                        budget.find('h4').extract()
                        budget.find('span').extract()
                        budget = budget.text.encode('utf-8').strip()
                        if '$' in budget or '€' in budget:
                            budget = budget.replace(' ', '').replace(',', '').replace('.', '')
                            
                            budget_sum = re.findall(r'\d+\s?', budget)[0]
                            if '$' in budget:
                                budget_cur = '$'
                            elif '€' in budget:
                                budget_cur = '€'
                            
                            if film_object and film_object['obj'].budget:
                                film_object['obj'].budget.budget = int(budget_sum)
                                film_object['obj'].budget.currency = budget_cur
                                film_object['obj'].budget.save()
                            else:
                                budget_obj = Budget.objects.create(
                                    budget = int(budget_sum),
                                    currency = budget_cur,
                                )
        else:
            budget_obj = None
            countries = []
            production = []
            persons = []
            for div in data.findAll('div', {'class': "txt-block"}):
                h4 = div.find('h4')
                if h4:
                    if h4.string == u'Country:':
                        for a in div.findAll('a'):
                            country_obj = country_data.get(a.string)
                            countries.append(country_obj)
                    elif h4.string == u'Budget:':
                        budget = div
                        budget.find('h4').extract()
                        budget.find('span').extract()
                        budget = budget.text.encode('utf-8').strip()
                        if '$' in budget or '€' in budget:
                            budget = budget.replace(' ', '').replace(',', '').replace('.', '')
                            
                            budget_sum = re.findall(r'\d+\s?', budget)[0]
                            if '$' in budget:
                                budget_cur = '$'
                            elif '€' in budget:
                                budget_cur = '€'
                            
                            if film_object and film_object['obj'].budget:
                                film_object['obj'].budget.budget = int(budget_sum)
                                film_object['obj'].budget.currency = budget_cur
                                film_object['obj'].budget.save()
                            else:
                                budget_obj = Budget.objects.create(
                                    budget = int(budget_sum),
                                    currency = budget_cur,
                                )
                    elif h4.string == u'Director:':
                        for d in div.findAll('a'):
                            d_name = d.find('span', itemprop="name")
                            if d_name:
                                d_name = d_name.string
                                d_id = d.get('href').split('?')[0]
                                d_id = long(d_id.replace('/name/nm', '').replace('/', ''))
                                persons.append({'name': d_name, 'action': 3, 'status': 1, 'id': d_id})
                    elif h4.string == u'Writers:':
                        for w in div.findAll('a'):
                            p_name = w.find('span', itemprop="name")
                            if p_name:
                                p_name = p_name.string
                                p_type = w.next_sibling
                                w_id = w.get('href').split('?')[0]
                                w_id = long(w_id.replace('/name/nm', '').replace('/', ''))
                                if u'screenplay' in p_type:
                                    persons.append({'name': p_name, 'action': 4, 'status': 1, 'id': w_id})
                    elif h4.string == u'Stars:':
                        for s in div.findAll('a'):
                            s_name = s.find('span', itemprop="name")
                            if s_name:
                                s_name = s_name.string
                                s_id = s.get('href').split('?')[0]
                                s_id = long(s_id.replace('/name/nm', '').replace('/',''))
                                persons.append({'name': s_name, 'action': 1, 'status': 1, 'id': s_id})


        distributors = []
        url2 = '%scompanycredits' % url
        time.sleep(1.5)
        req2 = opener.open(urllib2.Request(url2))
        if req2.getcode() == 200:
            data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
            distr_h4 = data2.find('h4', {'name': "distributors"})
            if distr_h4:
                ul = distr_h4.find_next("ul")
                for link in ul.findAll('a'):
                    distr_name = link.text.encode('utf-8')
                    if distr_name not in nof_distr:
                        distr_details = link.next_sibling.encode('utf-8').strip()
                        
                        if country_id == 1:
                            cntry = 'USA'
                        else:
                            cntry = 'France'

                        if cntry in distr_details and 'theatrical' in distr_details:
                            distr_year = re.findall(r'\d{4}', distr_details)
                            distr_year = distr_year[0] if distr_year else None
                            distributors.append({'year': distr_year, 'name': distr_name})

        distr_data = []
        
        if distributors:
            distributors = sorted(distributors, key=operator.itemgetter('year'))
            cur_year = distributors[0]['year']
            for distrib in distributors:
                if distrib['year'] == cur_year:
                    distr_slug = low(del_separator(distrib['name']))
                    distr_obj = distr_objs.get(distr_slug)
                    if distr_obj:
                        distr_data.append(distr_obj)
                    else:
                        distr_nof_data += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (distrib['name'].replace('&', '&amp;'), distr_slug, None)
                        nof_distr.append(distrib['name'])
        
        poster_obj = None
        if poster:
            time.sleep(1.5)
            poster_obj = get_imdb_poster(poster, poster_name)

        person_list = []
        for pe in persons:
            person_id = pe['id']
            person_obj = persons_data.get(person_id)
            if person_obj:
                person_list.append({'person': person_obj, 'st': pe['status'], 'act': pe['action']})
            else:
                if person_id not in nof_persons:
                    try:
                        person_name = pe['name'].decode('utf8').encode('utf-8')
                    except UnicodeEncodeError:
                        person_name = pe['name'].encode('utf-8')
                    person_slug = low(del_separator(person_name))
                    data_nof_persons += '<person name="%s" slug="%s" code="%s" name_alt="" slug_alt=""></person>' % (person_name, person_slug, person_id)
                    nof_persons.append(pe['id'])
        
        new = False
        if film_object:
            if not film_object['obj'].imdb_id:
                film_object['obj'].imdb_id = imdb
            if not film_object['obj'].budget and budget_obj:
                film_object['obj'].budget = budget_obj
            if film_object['obj'].runtime != runtime:
                film_object['obj'].runtime = runtime
            if film_object['obj'].imdb_votes != imdb_votes:
                film_object['obj'].imdb_votes = imdb_votes
                film_object['obj'].imdb_rate = imdb_rate
            if film_object['obj'].year != fyear:
                film_object['obj'].year = fyear
            film_object['obj'].save()
        else:
            film_obj = Films.objects.create(
                year = fyear,
                note = note,
                runtime = runtime,
                rated = limit,
                budget = budget_obj,
                imdb_id = imdb,
                imdb_rate = imdb_rate,
                imdb_votes = imdb_votes,
            )
            film_object = {'releases': [], 'obj': film_obj}
            new = True
        
        
            if is_dump:
                films[int(imdb)] = {'obj': film_obj, 'releases': []}
                
        
        if release and release not in film_object['releases']:
            rel_obj = FilmsReleaseDate.objects.create(
                release = release,
                note = details,
                format = release_format,
                country_id = country_id,
            )
            film_object['obj'].release.add(rel_obj)
        
            if is_dump:
                films[int(imdb)]['releases'].append(rel_obj.release)
        
        if not new:
            for img in film_object['obj'].images.filter(status=0):
                img_p = '%s%s' % (settings.MEDIA_ROOT, img.file)
                try:
                    os.remove(img_p)
                except OSError: pass
                film_object['obj'].images.remove(img)
                img.delete()
                
        if poster_obj:
            film_object['obj'].images.add(poster_obj)
        
        
        film_names = [
            {'name': fname, 'status': 1},
            {'name': fslug, 'status': 2},
        ]
        for f in film_names:
            name_obj, name_created = NameFilms.objects.get_or_create(
                name = f['name'].strip(),
                status = f['status'],
                language = language,
                defaults = {
                    'name': f['name'].strip(),
                    'status': f['status'],
                    'language': language,
                })
            
            for fn in film_object['obj'].name.all():
                if fn.status == f['status'] and fn.language == language:
                    film_object['obj'].name.remove(fn)
                    
            film_object['obj'].name.add(name_obj)
            
        
        for c in countries:
            if c:
                if new:
                    film_object['obj'].country.add(c)
                else:
                    if c not in film_object['obj'].country.all():
                        film_object['obj'].country.add(c)
            
        for g in genres:
            if g:
                if new:
                    film_object['obj'].genre.add(g)
                else:
                    if g not in film_object['obj'].genre.all():
                        film_object['obj'].genre.add(g)
            
        for pr in production:
            if pr:
                if new:
                    film_object['obj'].production.add(pr)
                else:
                    if pr not in film_object['obj'].production.all():
                        film_object['obj'].production.add(pr)
        
        for pers in person_list:
            rel_fp, rel_fp_created = RelationFP.objects.get_or_create(
                person = pers['person'],
                status_act_id = pers['st'],
                action_id = pers['act'],
                films = film_object['obj'],
                defaults = {
                    'person': pers['person'],
                    'status_act_id': pers['st'],
                    'action_id': pers['act'],
                    'films': film_object['obj'],
                })
        
        for dis_data in distr_data:
            if new:
                film_object['obj'].distributor.add(dis_data)
            else:
                if dis_data not in film_object['obj'].distributor.all():
                    film_object['obj'].distributor.add(dis_data)

        film_obj = film_object['obj']
        count += 1

    return count, film_obj, distr_nof_data, data_nof_persons, nof_distr, nof_persons
Пример #9
0
def imdb_searching(query, exact=False):
    """Search IMDB for films matching *query* and collect basic info.

    Performs a title search against http://www.imdb.com/find, then for up
    to the first 5 usable results fetches each film page to extract the
    director name(s).

    Args:
        query: search string (film title).
        exact: if True, request an exact-title match from IMDB and keep
            only results released within +/- 2 years of the current year.

    Returns:
        A list of dicts with keys 'title', 'persons' (director names),
        'link' (IMDB path), 'year' (int), 'aka' (alternate title or ''),
        and 'id' (numeric IMDB id). Empty list on HTTP failure or no hits.
    """
    params = {
        'q': query,
        's': 'tt',
    }
    if exact:
        params['exact'] = 'true'

    url = 'http://www.imdb.com/find?%s' % urllib.urlencode(params)

    resp, content = httplib2.Http(disable_ssl_certificate_validation=True).request(url)

    result = []
    count = 0

    current_year = datetime.datetime.now().year
    from_year = current_year - 2
    to_year = current_year + 2

    if resp['status'] == '200':
        data = BeautifulSoup(content, "html5lib", from_encoding="utf-8")

        table = data.find('table', {'class': 'findList'})

        if table:
            for tr in table.findAll('tr'):
                td = tr.find('td', {'class': "result_text"})
                if td is None:
                    # Defensive: skip rows without a result cell.
                    continue

                # The release year appears as "(YYYY)" in the cell text;
                # entries without a year are skipped (matches old behavior,
                # where an empty match list failed the final truthiness test).
                year = None
                year_match = re.findall(r'\(\d+\)', td.text.encode('utf-8'))
                if year_match:
                    year = int(year_match[0].replace('(', '').replace(')', '').strip())

                if year is None:
                    continue

                # In exact mode keep only releases near the current year.
                if exact and not (from_year <= year <= to_year):
                    continue

                count += 1

                a = td.find('a')

                aka = td.find('i')
                aka = aka.string.encode('utf-8') if aka else ''

                link = a.get('href').split('?')[0]
                title = a.text.encode('utf-8')

                opener = give_me_cookie()
                url2 = 'http://www.imdb.com%s' % link
                req = opener.open(urllib2.Request(url2))

                imdb_id = int(link.replace('/title/tt', '').replace('/', ''))

                persons = []

                if req.getcode() == 200:
                    data2 = BeautifulSoup(req.read(), "html5lib", from_encoding="utf-8")

                    # Directors are listed in a "txt-block" div headed "Director:".
                    for div in data2.findAll('div', {'class': "txt-block"}):
                        h4 = div.find('h4')
                        if h4 and h4.string == u'Director:':
                            for d in div.findAll('a'):
                                d_name = d.find('span', itemprop="name")
                                if d_name:
                                    persons.append(d_name.string)

                result.append({'title': title, 'persons': persons, 'link': link, 'year': year, 'aka': aka, 'id': imdb_id})

                # Cap at 5 results to limit the number of page fetches.
                if count == 5:
                    break

    return result