from crawler_util import page_read
from db_helper import save
import convert_time  # project-local date helper; exact package path assumed


def reviewscrawler(filmid):
    """Crawl the reviews of one film, 25 per AJAX page, capped at 500."""
    path_url = 'http://www.imdb.com/title/' + filmid + '/reviews'
    temsoup = page_read.page_read_nolog(path_url)
    try:
        num_str = temsoup.select_one('.lister').select_one(
            '.header').div.get_text().strip()
        reviews_num = get_num(num_str)
        # only crawl the first 500 reviews
        if reviews_num > 500:
            reviews_num = 500
        # each AJAX page holds 25 reviews; chain the pages via the pagination key
        key = ''
        for i in range(reviews_num // 25):
            theurl = ('http://www.imdb.com/title/' + filmid +
                      '/reviews/_ajax?ref_=undefined&paginationKey=' + key)
            key = review_page_crawler(filmid, theurl)
    except Exception as e:
        print(e.args)
        print("maybe no network")
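# get_num() is defined elsewhere in the repo; the sketch below is a hypothetical
# stand-in, assuming it extracts the first comma-grouped integer from strings
# such as '1,234 Reviews' -- the name matches the call sites, the regex is an
# assumption.
import re


def get_num(s):
    """Hypothetical helper: first comma-grouped integer in s, else 0."""
    match = re.search(r'\d[\d,]*', s)
    if match:
        return int(match.group().replace(',', ''))
    return 0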
def review_page_crawler(film_id, myurl):
    """Parse one AJAX review page; return the pagination key of the next page."""
    soup = page_read.page_read_nolog(myurl)
    key = ''
    if soup:
        for item in soup.select('.lister-item-content'):
            review = dict()
            review['imdb_filmID'] = film_id
            # the rating bar is absent when the reviewer gave no score
            if item.select_one('.ipl-ratings-bar'):
                review['score'] = float(
                    item.select_one(
                        '.ipl-ratings-bar').get_text().strip().split('/')[0])
            else:
                review['score'] = 0
            # e.g. '123 out of 456 found this helpful.' -> '123/456'
            helpfulness_str_split = item.select_one(
                '.actions').get_text().strip().split(' ')
            review['helpfulness'] = (helpfulness_str_split[0] + '/' +
                                     helpfulness_str_split[3])
            review['summary'] = item.select_one('.title').string
            review['userName'] = item.select_one(
                '.display-name-date').select_one(
                    '.display-name-link').get_text()
            review['time'] = convert_time.local_date(
                item.select_one('.display-name-date').select_one(
                    '.review-date').get_text())
            review['userCountry'] = None
            review['text'] = item.select_one('.text').get_text().strip()
            save.save_review(review)
        # the "load more" node carries the key of the next AJAX page;
        # it disappears on the last page, so guard against None
        load_more = soup.select_one('.load-more-data')
        if load_more:
            key = load_more['data-key']
    return key
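# convert_time.local_date() is a project helper used by both review crawlers.
# A minimal sketch, assuming it normalizes IMDb's '23 May 2017'-style date
# strings to a datetime.date -- the parsing format is an assumption:
from datetime import datetime


def local_date(date_str):
    """Hypothetical: parse an IMDb date such as '23 May 2017'."""
    return datetime.strptime(date_str.strip(), '%d %B %Y').date()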
def maintain_ip(self):
    """Refresh the proxy pool from free-proxy-list.net."""
    self.proxies = list()
    soup = page_read.page_read_nolog("http://www.free-proxy-list.net/")
    tbody = soup.find_all('tbody')[0]
    # one proxy per table row: column 0 is the IP, column 1 the port
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        proxytem = {
            "http": "http://" + cells[0].get_text() + ":" + cells[1].get_text()
        }
        self.proxies.append(proxytem)
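# The dicts stored above use the proxies-mapping format that requests expects.
# A hypothetical caller (the function name and retry policy are assumptions,
# not part of the repo) could rotate through the pool like this:
import random

import requests


def fetch_with_proxy(crawler, url):
    """Hypothetical usage: retry a request through random proxies from the pool."""
    for _ in range(3):
        proxy = random.choice(crawler.proxies)
        try:
            return requests.get(url, proxies=proxy, timeout=10)
        except requests.RequestException:
            continue  # dead proxy; pick another one
    return None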
def producerscrawler(producer_url, producer_type):
    """Crawl an IMDb name page and save the person as a producer record."""
    producer = dict()
    producer['type'] = producer_type
    soup = page_read.page_read_nolog(producer_url)
    # get the producer ID, e.g. 'nm0000158' out of .../name/nm0000158/?ref_=...
    producer_id = producer_url.split('name/')[1].split('/')[0].split('?')[0]
    producer['producer_id'] = producer_id
    if soup:
        # get the picture path
        if soup.select('.image'):
            if soup.select('.image')[0].img:
                producer['image'] = soup.select('.image')[0].img.get('src')
        # get the name
        producer['name'] = soup.h1.get_text().strip()
        # collect the IDs of the "known for" films
        films_id = list()
        for filmiddiv in soup.select('.knownfor-title'):
            film_ref = 'http://www.imdb.com/' + filmiddiv.a.get('href')
            films_id.append(film_ref.split('title/')[1].split('/')[0])
        producer['films'] = films_id
        save.save_producer(producer)  # save to database
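# Called from crawl_imdb() below; a direct call takes the name-page URL plus a
# role label (the name ID here is illustrative):
#
#     producerscrawler('http://www.imdb.com/name/nm0000158/', 'Director')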
import urllib.parse


def get_click(name):
    """Search YouTube for the film's official trailer and read its view count."""
    name = name + ' official trailer'
    the_url = ('http://www.youtube.com/results?search_query=' +
               urllib.parse.quote(name))
    soup = page_read.page_read_nolog(the_url)
    if soup:
        info_line = soup.find_all(id='metadata-line')
        if len(info_line) >= 2:
            print(info_line[0].span.get_text())
        # With the old result layout, the view counts of the first two hits
        # were compared and the larger pair returned:
        # t1 = filter_click(soup.select('.yt-lockup-meta-info')[0].get_text())
        # t2 = filter_click(soup.select('.yt-lockup-meta-info')[1].get_text())
        # if t1 and t2:
        #     return t1 if t1[1] > t2[1] else t2
    else:
        print('fail to connect youtube')
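# filter_click() is referenced above but not defined in this file. A minimal
# sketch, assuming it pairs the metadata text with its integer view count so
# that t1[1] > t2[1] compares clicks -- regex and return shape are assumptions:
import re


def filter_click(meta_text):
    """Hypothetical: return (text, views) parsed from e.g. '1,234,567 views'."""
    match = re.search(r'([\d,]+)\s+views', meta_text)
    if match:
        return meta_text, int(match.group(1).replace(',', ''))
    return None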
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db


def get_exist_list():
    """Collect the first column of the pending query result into a list."""
    exists = list()
    for a_item in cursor.fetchall():
        exists.append(a_item[0])
    return exists


cursor.execute('SELECT imdb_filmID FROM FilmDB')
exist_films = get_exist_list()
soup = page_read.page_read_nolog(
    'http://www.imdb.com/search/title?count=100&'
    'groups=oscar_best_picture_winners&title_type=feature&sort=release_date,desc'
)
for item in soup.select('.lister-item-header'):
    the_filmid = item.a.get('href').split('title/')[1].split('/')[0]
    if the_filmid in exist_films:
        # the film is already stored; just flag it as an Oscar winner
        cursor.execute(
            '''UPDATE FilmDB SET Oscar = 1 WHERE imdb_filmID=%s''',
            (the_filmid, ))
        db.commit()
    else:
        print(the_filmid)
        moviescrawler.crawl_imdb(the_filmid, 'Oscar')
        db.commit()
import re

from crawler_util import page_read
from db_helper import save
# sibling crawler modules; import paths assumed per the repo layout
from film_update import producerscrawler, reviewscrawler, douban_score


def crawl_imdb(film_id, film_type, need_update=False):
    movieurl = 'http://www.imdb.com/title/' + film_id + '/'
    soup = page_read.page_read_nolog(movieurl)
    if not soup:
        return
    film = dict()
    # write id and name
    film['imdb_filmID'] = film_id
    film['name'] = soup.find_all(attrs={'itemprop': 'name'})[0].get_text()
    # write summary; check the selector result before indexing it
    summary_list = soup.select('.summary_text')
    if summary_list:
        film['summary'] = summary_list[0].get_text().strip().split(
            "See")[0].strip()
    else:
        print('no summary')
    # write directors and actors as name IDs
    directors = list()
    actors = list()
    for dire in soup.find_all(attrs={'itemprop': 'director'}):
        directors.append(dire.a['href'].split('name/')[1].split('/?')[0])
    for dire in soup.find_all(attrs={'itemprop': 'actors'}):
        actors.append(dire.a['href'].split('name/')[1].split('/?')[0])
    film['directors'] = directors
    film['actors'] = actors
    tags_list = soup.select('.see-more.inline.canwrap')
    pattern_plot = r'\s[\s|]'
    # write plot key words
    if tags_list:
        plot_key = re.split(pattern_plot, tags_list[0].get_text().strip())
        writefile_plot(plot_key, film)
    else:
        print(movieurl + ": no plot key words\n")
    # write genres
    if len(tags_list) >= 2:
        genres = re.split(pattern_plot, tags_list[1].get_text().strip())
        writefile_plot(genres, film)
    else:
        print(movieurl + ": no genres\n")
    # write detail
    if soup.select('.txt-block'):
        writefile_detail(soup.select('.txt-block'), film)
    else:
        print(movieurl + ": no detail\n")
    # write score and rating count
    if soup.select('.ratingValue'):
        film['score'] = float(
            soup.select('.ratingValue')[0].strong.get_text().strip())
        film['ratingNum'] = int(
            soup.find_all(
                attrs={'itemprop': 'ratingCount'})[0].get_text().replace(
                    ',', ''))
    # write poster and watchURL
    if soup.select('.poster'):
        film['posterURL'] = soup.select('.poster')[0].img.get('src')
    if soup.select('.slate'):
        film['filmWatchURL'] = 'http://www.imdb.com' + soup.select(
            '.slate')[0].a['href']
    # write the cast as 'actor:role1,role2/actor:role/...'
    cast = ''
    if soup.select('.cast_list'):
        for item in soup.select('.cast_list')[0].find_all('tr'):
            if not item.has_attr('class'):
                continue  # skip the header row
            cast += item.find_all(attrs={'itemprop': 'name'})[0].string + ':'
            for i, character in enumerate(
                    item.select('.character')[0].find_all('a')):
                if i == 0:
                    cast += character.string
                else:
                    cast += ',' + character.string
            cast += '/'
    film['cast'] = cast
    # write storyline
    if soup.find_all(attrs={'itemprop': 'description'}):
        film['storyline'] = soup.find_all(
            attrs={'itemprop': 'description'})[0].get_text().strip()
    # write awards as a single space-joined line
    if soup.find_all(attrs={'itemprop': 'awards'}):
        tem_str = ''
        for a_str in soup.find_all(
                attrs={'itemprop': 'awards'})[0].get_text().strip().split('\n'):
            tem_str += a_str.strip() + ' '
        film['award'] = tem_str
    # write worldgross
    # film['worldwideGross'] = get_worldgross(movieurl)
    # Oscar winners are stored as 'Normal' films with the Oscar flag set
    if film_type == 'Oscar':
        film['filmType'] = 'Normal'
        film['Oscar'] = 1
    else:
        film['filmType'] = film_type
    # write douban_score
    film['douban_score'] = douban_score.get_score(film_id)
    # save film
    if need_update:
        save.save_film_update(film)
    else:
        save.save_film(film)
    # crawl the producers (directors and actors)
    imdbref = 'http://www.imdb.com/'
    for director in soup.find_all(attrs={'itemprop': 'director'}):
        producerscrawler.producerscrawler(imdbref + director.a.get('href'),
                                          'Director')
    for actor in soup.find_all(attrs={'itemprop': 'actors'}):
        producerscrawler.producerscrawler(imdbref + actor.a.get('href'),
                                          'Actor')
    # crawl the reviews
    reviewscrawler.reviewscrawler(film_id)
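# writefile_plot() and writefile_detail() are defined elsewhere in this module.
# A hypothetical sketch of writefile_plot(), assuming it stores the cleaned
# token list on the film record (the 'tags' key is an assumption):
def writefile_plot(tokens, film):
    """Hypothetical: keep non-empty tokens, dropping the 'See All'-style tail."""
    cleaned = [t.strip() for t in tokens
               if t.strip() and not t.startswith('See')]
    film.setdefault('tags', []).extend(cleaned)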
    return 0  # get_num() falls back to 0 when no number is found


def get_weekgross(s):
    """Find the 'Opening Weekend' detail block and parse its dollar figure."""
    for item in s:
        if item.h4 and item.h4.string:
            if item.h4.string.startswith('Opening Weekend'):
                return get_num(item.get_text())


cursor.execute(
    'SELECT FilmDB.imdb_filmID '
    'FROM FilmDB,TrailerClick '
    'WHERE FilmDB.imdb_filmID=TrailerClick.imdb_filmID '
    "AND (country='USA' OR country='UK') "
    'AND gross>1000000 AND openweek_gross IS NULL'
)
filmids = get_exist_list()
print(filmids)
for filmid in filmids:
    soup = page_read.page_read_nolog('http://www.imdb.com/title/' + filmid +
                                     '/')
    if soup.select('.txt-block'):
        weekgross = get_weekgross(soup.select('.txt-block'))
        if weekgross:
            cursor.execute(
                '''UPDATE TrailerClick SET openweek_gross=%s
                   WHERE imdb_filmID=%s''',
                (weekgross, filmid))
            db.commit()
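# For reference, the 'Opening Weekend' detail block renders roughly as
#
#     Opening Weekend USA: $35,206,906, 5 November 2017
#
# (values illustrative), so the first comma-grouped number, 35206906, is the
# figure get_num() is expected to return for the block.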
def review_page_crawler(film_id, myurl):
    """Parse one page of the old-layout review list (#tn15content)."""
    soup = page_read.page_read_nolog(myurl)
    if not soup:
        return
    contentls = soup.select('#tn15content')[0]
    for item in contentls.find_all('div'):
        review = dict()
        review['imdb_filmID'] = film_id
        if item.attrs:
            continue  # only attribute-less divs hold review headers
        # the second <img> alt text carries the score, e.g. '7/10'
        if len(item.find_all('img')) > 1:
            review['score'] = float(
                item.find_all('img')[1].get('alt').split('/')[0])
        # reviews without a helpfulness line start directly at the summary
        count = None
        review_useful = False
        for temp in item.stripped_strings:
            if temp.endswith('review useful:'):
                review_useful = True
                break
        if not review_useful:
            count = 1
        # walk the strings in order: helpfulness -> summary -> (author label)
        # -> user name -> optional country -> date
        for thestr in item.stripped_strings:
            if thestr.endswith('review useful:'):
                # e.g. '5 out of 8 people found the following review useful:'
                review['helpfulness'] = (thestr.split(' ')[0] + '/' +
                                         thestr.split(' ')[3])
                count = 1
            elif thestr.startswith('***'):
                continue  # spoiler warning banner
            elif count == 1:
                count += 1
                review['summary'] = thestr
            elif count == 2:
                count += 1  # skip the 'Author:' label line
            elif count == 3:
                review['userName'] = thestr
                count += 1
            elif count == 4:
                # the country line is optional; otherwise this is the date
                if thestr.startswith('from'):
                    review['userCountry'] = thestr[5:]
                    count += 1
                else:
                    review['time'] = convert_time.local_date(thestr)
                    count += 2
            elif count == 5:
                review['time'] = convert_time.local_date(thestr)
        # the review body sits in the next sibling element; join the wrapped
        # lines with spaces so words do not run together
        pp = item.next_sibling.next_sibling
        review['text'] = ' '.join(pp.get_text().strip().split('\n'))
        save.save_review(review)
    return
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db

soup = page_read.page_read_nolog(
    'http://www.imdb.com/chart/top/?ref_=nv_mv_250_6')
for item in soup.select('.titleColumn'):
    ref = item.a.get('href').strip()
    film_id = ref.split('title/')[1].split('/')[0]
    # the string 'False' would be truthy, so rely on the real boolean default
    moviescrawler.crawl_imdb(film_id, 'Top250')
    db.commit()