def reviewscrawler(filmid):
    # the_f_log = codecs.open('E:\GitHub\Filmdia\statistics\log/logReview.txt', 'a', 'utf-8')
    path_url = 'http://www.imdb.com/title/' + filmid + '/reviews'
    # print path_url
    # temsoup = page_read.page_read(path_url, the_f_log)
    temsoup = page_read.page_read_nolog(path_url)
    try:
        num_str = temsoup.select_one('.lister').select_one(
            '.header').div.get_text().strip()
        reviews_num = get_num(num_str)

        # crawl at most 500 reviews
        if reviews_num > 500:
            reviews_num = 500

        # each AJAX page returns 25 reviews, so round up to cover the remainder
        range_y = -(-reviews_num // 25)
        key = ''
        for i in range(range_y):
            theurl = 'http://www.imdb.com/title/' + filmid + '/reviews/_ajax?ref_=undefined&paginationKey=' + key
            key = review_page_crawler(filmid, theurl)
            # print(key)
        # review_page_crawler(filmid, theurl, the_f_log)
    except Exception as e:
        print(e.args)
        print("network may be unavailable")


def review_page_crawler(film_id, myurl):
    soup = page_read.page_read_nolog(myurl)
    # soup = page_read.page_read(myurl, f_log)
    key = ''
    if soup:
        for item in soup.select('.lister-item-content'):
            review = dict()
            review['imdb_filmID'] = film_id

            if item.select_one('.ipl-ratings-bar'):
                review['score'] = float(
                    item.select_one(
                        '.ipl-ratings-bar').get_text().strip().split('/')[0])
            else:
                review['score'] = 0
            # e.g. "123 out of 456 found this helpful." -> "123/456";
            # split() (unlike split(' ')) tolerates newlines and runs of spaces
            helpfulness_str_split = item.select_one(
                '.actions').get_text().strip().split()
            review['helpfulness'] = helpfulness_str_split[
                0] + '/' + helpfulness_str_split[3]
            review['summary'] = item.select_one('.title').get_text().strip()
            review['userName'] = item.select_one(
                '.display-name-date').select_one(
                    '.display-name-link').get_text()
            review['time'] = convert_time.local_date(
                item.select_one('.display-name-date').select_one(
                    '.review-date').get_text())

            review['userCountry'] = None

            review['text'] = item.select_one('.text').get_text().strip()
            # print(review)
            save.save_review(review)
        # the last page has no .load-more-data element, so guard the lookup
        load_more = soup.select_one('.load-more-data')
        if load_more:
            key = load_more['data-key']
    return key
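# get_num is defined elsewhere in this project and not shown above. A
# hypothetical reconstruction, assuming it extracts the first
# comma-grouped integer from text such as "1,234 Reviews":
import re

def get_num(text):
    # pull the first run of digits/commas and normalize it to an int
    match = re.search(r'\d[\d,]*', text)
    if not match:
        return 0
    return int(match.group().replace(',', ''))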
Example #3
def maintain_ip(self):
    # rebuild the proxy pool from a public free-proxy listing
    self.proxies = list()
    soup = page_read.page_read_nolog("http://www.free-proxy-list.net/")
    tbody = soup.find_all('tbody')[0]
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        proxytem = {
            "http": "http://" + cells[0].get_text() + ":" + cells[1].get_text()
        }
        self.proxies.append(proxytem)
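# Illustrative use of the pool built above (not part of the original
# code): each entry already has the shape requests expects for its
# `proxies` argument, so a caller could rotate proxies per request.
# `requests` is an assumption here; page_read may wrap a different
# HTTP client.
import random
import requests

def fetch_with_proxy(url, proxies_pool):
    proxy = random.choice(proxies_pool)  # e.g. {"http": "http://1.2.3.4:8080"}
    return requests.get(url, proxies=proxy, timeout=10)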
Example #4
def producerscrawler(producer_url, producer_type):
    producer = dict()
    producer['type'] = producer_type

    soup = page_read.page_read_nolog(producer_url)

    # get producer ID
    producer_id = producer_url.split('name/')[1].split('/')[0].split('?')[0]
    producer['producer_id'] = producer_id

    if soup:
        # get picture path
        image_tags = soup.select('.image')
        if image_tags and image_tags[0].img:
            producer['image'] = image_tags[0].img.get('src')

        # get name
        name = soup.h1.get_text().strip()
        producer['name'] = name
        # write filmsid
        films_id = list()
        for filmiddiv in soup.select('.knownfor-title'):
            # hrefs already begin with '/', so don't add a second slash
            film_ref = 'http://www.imdb.com' + filmiddiv.a.get('href')
            filmid = film_ref.split('title/')[1].split('/')[0]
            films_id.append(filmid)

        producer['films'] = films_id
        save.save_producer(producer)  # save to database
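# Example invocation (the name ID below is illustrative, not taken from
# this code):
# producerscrawler('http://www.imdb.com/name/nm0000229/', 'Director')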
Example #5
import urllib.parse

def get_click(name):
    name = name + ' official trailer'
    the_url = 'http://www.youtube.com/results?search_query=' + urllib.parse.quote(
        name)
    soup = page_read.page_read_nolog(the_url)
    if soup:
        # print(soup)
        info_line = soup.find_all(id='metadata-line')
        # print(info_line)
        if len(info_line) >= 2:
            # BeautifulSoup tags have no get_string(); use get_text()
            print(info_line[0].span.get_text())
            # t1 = filter_click(soup.select('.yt-lockup-meta-info')[0].get_text())
            # t2 = filter_click(soup.select('.yt-lockup-meta-info')[1].get_text())
            # if t1 and t2:
            #     if t1[1] > t2[1]:
            #         return t1
            #     else:
            #         return t2
    else:
        print('failed to connect to YouTube')
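# The commented-out block above calls a filter_click helper that is not
# shown. A hypothetical version consistent with that usage (returns a
# tuple whose second element is the view count, or None when no count
# appears in text like "3 years ago 1,234,567 views"):
import re

def filter_click(meta_text):
    match = re.search(r'([\d,]+)\s+views', meta_text)
    if not match:
        return None
    return meta_text, int(match.group(1).replace(',', ''))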
Example #6
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db


def get_exist_list():
    exist_tup = cursor.fetchall()
    exists = list()
    for a_item in exist_tup:
        exists.append(a_item[0])
    return exists


cursor.execute('SELECT imdb_filmID FROM FilmDB')
exist_films = get_exist_list()
soup = page_read.page_read_nolog(
    'http://www.imdb.com/search/title?count=100&'
    'groups=oscar_best_picture_winners&title_type=feature&sort=release_date,desc'
)
imdb_href = 'http://www.imdb.com'
for item in soup.select('.lister-item-header'):
    the_filmid = item.a.get('href').split('title/')[1].split('/')[0]
    if the_filmid in exist_films:
        cursor.execute('''UPDATE FilmDB SET Oscar = 1 WHERE imdb_filmID=%s''',
                       (the_filmid, ))
        db.commit()
    else:
        print(the_filmid)
        moviescrawler.crawl_imdb(the_filmid, 'Oscar')
db.commit()
Example #7
import re

def crawl_imdb(film_id, film_type, need_update=False):
    movieurl = 'http://www.imdb.com/title/' + film_id + '/'
    soup = page_read.page_read_nolog(movieurl)
    if not soup:
        return
    film = dict()
    # write id and name
    film['imdb_filmID'] = film_id
    film_name = soup.find_all(attrs={'itemprop': 'name'})[0].get_text()
    film['name'] = film_name

    # write summary and directors, etc.
    summary_list = soup.select('.summary_text')
    if summary_list:
        summary = summary_list[0].get_text().strip().split("See")[0].strip()
        film['summary'] = summary
    else:
        print('no summary')
    directors = list()
    actors = list()
    for dire in soup.find_all(attrs={'itemprop': 'director'}):
        directors.append(dire.a['href'].split('name/')[1].split('/?')[0])
    for dire in soup.find_all(attrs={'itemprop': 'actors'}):
        actors.append(dire.a['href'].split('name/')[1].split('/?')[0])
    film['directors'] = directors
    film['actors'] = actors

    tags_list = soup.select('.see-more.inline.canwrap')
    pattern_plot = r'\s[\s|]'

    # write plot key words
    if tags_list:
        plot_key = re.split(pattern_plot, tags_list[0].get_text().strip())
        writefile_plot(plot_key, film)
    else:
        print(movieurl + ": no plot key words\n")

    # write genres
    if len(tags_list) >= 2:
        genres = re.split(pattern_plot, tags_list[1].get_text().strip())
        writefile_plot(genres, film)
    else:
        print(movieurl + ": no genres\n")

    # write detail
    if soup.select('.txt-block'):
        writefile_detail(soup.select('.txt-block'), film)
    else:
        print(movieurl + ": no detail\n")

    if soup.select('.ratingValue'):
        film['score'] = float(
            soup.select('.ratingValue')[0].strong.get_text().strip())
        film['ratingNum'] = int(
            soup.find_all(
                attrs={'itemprop': 'ratingCount'})[0].get_text().replace(
                    ',', ''))

    # write poster and watchURL
    if soup.select('.poster'):
        film['posterURL'] = soup.select('.poster')[0].img.get('src')
    if soup.select('.slate'):
        film['filmWatchURL'] = 'http://www.imdb.com' + soup.select(
            '.slate')[0].a['href']

    # write cast
    cast = ''
    if soup.select('.cast_list'):
        table = soup.select('.cast_list')[0]
        for item in table.find_all('tr'):
            if not item.has_attr('class'):
                continue
            # get_text() avoids a TypeError when .string is None
            cast += item.find_all(
                attrs={'itemprop': 'name'})[0].get_text().strip() + ':'
            i = 0
            for character in item.select('.character')[0].find_all('a'):
                if i == 0:
                    cast += character.string
                    i += 1
                else:
                    cast += ',' + character.string
            cast += '/'
        film['cast'] = cast

    # write storyline
    if soup.find_all(attrs={'itemprop': 'description'}):
        film['storyline'] = soup.find_all(
            attrs={'itemprop': 'description'})[0].get_text().strip()

    # write awards
    if soup.find_all(attrs={'itemprop': 'awards'}):
        tem_str = ''
        awards_text = soup.find_all(
            attrs={'itemprop': 'awards'})[0].get_text()
        for a_str in awards_text.strip().split('\n'):
            tem_str += a_str.strip() + ' '
        film['award'] = tem_str
    # write worldgross
    # film['worldwideGross'] = get_worldgross(movieurl)

    if film_type == 'Oscar':
        film['filmType'] = 'Normal'
        film['Oscar'] = 1
    else:
        film['filmType'] = film_type
    # write douban_score
    film['douban_score'] = douban_score.get_score(film_id)
    # save film
    if need_update:
        save.save_film_update(film)
    else:
        save.save_film(film)

    # scrape producer
    imdbref = 'http://www.imdb.com'  # hrefs already start with '/'
    for director in soup.find_all(attrs={'itemprop': 'director'}):
        director_ref = imdbref + director.a.get('href')
        producerscrawler.producerscrawler(director_ref, 'Director')
    for actor in soup.find_all(attrs={'itemprop': 'actors'}):
        actor_ref = imdbref + actor.a.get('href')
        producerscrawler.producerscrawler(actor_ref, 'Actor')
    # scrape review
    reviewscrawler.reviewscrawler(film_id)
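# Assumed usage, mirroring the Top 250 loop at the end of this page
# (tt0111161 is just an illustrative film ID):
# crawl_imdb('tt0111161', 'Normal')          # insert a new film
# crawl_imdb('tt0111161', 'Normal', True)    # update an existing row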
Example #8
        return 0


def get_weekgross(s):
    # scan the detail blocks for the "Opening Weekend" gross figure
    for item in s:
        if item.h4 and item.h4.string and item.h4.string.startswith(
                'Opening Weekend'):
            return get_num(item.get_text())


cursor.execute(
    'SELECT FilmDB.imdb_filmID '
    'FROM FilmDB,TrailerClick '
    'WHERE FilmDB.imdb_filmID=TrailerClick.imdb_filmID '
    'AND (country=\'USA\' OR country=\'UK\') '
    'AND gross>1000000 AND openweek_gross IS NULL'
)

filmids = get_exist_list()
print(filmids)
for filmid in filmids:
    soup = page_read.page_read_nolog('http://www.imdb.com/title/' + filmid +
                                     '/')
    # page_read_nolog may return None on failure, so guard before select
    if soup and soup.select('.txt-block'):
        weekgross = get_weekgross(soup.select('.txt-block'))
        if weekgross:
            cursor.execute(
                '''UPDATE TrailerClick SET openweek_gross=%s WHERE imdb_filmID=%s''',
                (weekgross, filmid))
            db.commit()
Example #9
def review_page_crawler(film_id, myurl):
    soup = page_read.page_read_nolog(myurl)
    # soup = page_read.page_read(myurl, f_log)
    if soup:
        contentls = soup.select('#tn15content')[0]

        for item in contentls.find_all('div'):
            review = dict()
            review['imdb_filmID'] = film_id
            if not item.attrs:

                if len(item.find_all('img')) > 1:
                    review['score'] = float(
                        item.find_all('img')[1].get('alt').split('/')[0])

                # anonymous reviews omit the "... review useful:" line;
                # start the field counter at 1 for those
                review_useful = any(
                    temp.endswith('review useful:')
                    for temp in item.stripped_strings)
                count = None
                if not review_useful:
                    count = 1
                for thestr in item.stripped_strings:
                    if thestr.endswith('review useful:'):
                        review['helpfulness'] = thestr.split(
                            ' ')[0] + '/' + thestr.split(' ')[3]
                        # print('Helpfulness: ' + thestr.split(' ')[0] + '/')
                        # print(thestr.split(' ')[3])
                        count = 1
                    elif thestr.startswith('***'):
                        continue
                    elif count == 1:
                        count += 1
                        review['summary'] = thestr
                        # print('Summary: ' + thestr)
                    elif count == 2:
                        count += 1
                        # # print(thestr + ' ')
                    elif count == 3:
                        review['userName'] = thestr
                        # print(thestr + ' ')
                        count += 1
                    elif count == 4:
                        # print(thestr)
                        if thestr.startswith('from'):
                            review['userCountry'] = thestr[5:]
                            count += 1
                        else:
                            review['time'] = convert_time.local_date(thestr)
                            count += 2
                    elif count == 5:
                        review['time'] = convert_time.local_date(thestr)

                # the review body lives in the following <p>; join wrapped
                # lines with spaces so words don't run together
                pp = item.next_sibling.next_sibling
                review['text'] = ' '.join(
                    pp.get_text().strip().split('\n'))
                save.save_review(review)
    return
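# convert_time.local_date is a project helper not shown in these
# examples. Assuming it parses IMDb date strings such as
# "12 October 2017", a minimal stand-in might be:
from datetime import datetime

def local_date(date_str):
    return datetime.strptime(date_str.strip(), '%d %B %Y').date()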
Example #10
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db

i = 0
soup = page_read.page_read_nolog(
    'http://www.imdb.com/chart/top/?ref_=nv_mv_250_6')
for item in soup.select('.titleColumn'):
    ref = item.a.get('href').strip()
    film_id = ref.split('title/')[1].split('/')[0]
    # pass the boolean False, not the truthy string 'False'
    moviescrawler.crawl_imdb(film_id, 'Top250', False)
db.commit()