from film_update import moviescrawler
from crawler_util import page_read

# Walk the paged IMDb feature-film search listing (release dates 1915-01-01
# through 2017-06-30, 100 titles per page) and pass every title id found on
# each page to moviescrawler.crawl_imdb with the 'Normal' film type.
the_url = ('http://www.imdb.com/search/title?count=100'
           '&release_date=1915-01-01,2017-06-30&title_type=feature&page=')
imdb_href = 'http://www.imdb.com'

for page_no in range(1, 100):
    listing = page_read.page_read_power(the_url + str(page_no))
    if not listing:
        # page_read_power gave back a falsy result for this page; skip it.
        continue
    for header in listing.select('.lister-item-header'):
        href = header.a.get('href')
        movieurl = imdb_href + href
        print('{0}: {1}'.format(page_no, movieurl))
        # The film id is the path segment that follows 'title/'.
        the_filmid = href.split('title/')[1].split('/')[0]
        if the_filmid:
            moviescrawler.crawl_imdb(the_filmid, 'Normal')
Oscar BOOL, budget INT, gross INT, worldwideGross INT, linear_predict INT, linear_test DOUBLE, lasso_predict INT, lasso_test DOUBLE, knn_predict INT, knn_test DOUBLE, poly_predict INT, poly_test DOUBLE, UNIQUE(imdb_filmID) )DEFAULT CHARSET = utf8''' cursor.execute(create_film) cursor.execute('DELETE FROM UpdateFilm') db.commit() db.close() imdb_href = 'http://www.imdb.com/' soup = page_read.page_read_power(imdb_href) film_type = ('ThisWeek', 'Latest', 'Coming') for i in range(0, 3): for item in soup.select('.aux-content-widget-2')[i].select('.title'): film_id = item.a['href'].split('title/')[1].split('?')[0] moviescrawler.crawl_imdb(film_id, film_type[i], need_update=True)
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db

# Fetch the IMDb Top 250 chart page and pass the id of every listed title to
# moviescrawler.crawl_imdb with the 'Top250' film type, committing the shared
# database connection after each title.
soup = page_read.page_read_nolog(
    'http://www.imdb.com/chart/top/?ref_=nv_mv_250_6')

for item in soup.select('.titleColumn'):
    ref = item.a.get('href').strip()
    # The film id is the path segment that follows 'title/'.
    film_id = ref.split('title/')[1].split('/')[0]
    # Pass a real boolean: the string 'False' is truthy, so any `if flag:`
    # test inside crawl_imdb would have treated it as True.
    # NOTE(review): the third positional parameter is presumably
    # `need_update` -- confirm against crawl_imdb's signature.
    moviescrawler.crawl_imdb(film_id, 'Top250', False)
    db.commit()
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db


def get_exist_list():
    """Return the first column of every row pending on the shared cursor.

    A query must already have been executed on ``cursor`` before this is
    called; the function only drains its result set via ``fetchall()``.
    """
    return [row[0] for row in cursor.fetchall()]


# Flag every Oscar best-picture winner listed by IMDb: films already stored
# in FilmDB get Oscar = 1, the others are crawled with the 'Oscar' film type.
cursor.execute('SELECT imdb_filmID FROM FilmDB')
# A set gives constant-time membership tests inside the loop below.
exist_films = set(get_exist_list())

# `page_read` was previously used here without being imported, which raised
# NameError before anything was fetched.
soup = page_read.page_read_nolog(
    'http://www.imdb.com/search/title?count=100&'
    'groups=oscar_best_picture_winners&title_type=feature&sort=release_date,desc'
)

for item in soup.select('.lister-item-header'):
    # The film id is the path segment that follows 'title/'.
    the_filmid = item.a.get('href').split('title/')[1].split('/')[0]
    if the_filmid in exist_films:
        cursor.execute(
            '''UPDATE FilmDB SET Oscar = 1 WHERE imdb_filmID=%s''',
            (the_filmid, ))
    else:
        print(the_filmid)
        moviescrawler.crawl_imdb(the_filmid, 'Oscar')
    # Commit after every film, whichever branch handled it.
    db.commit()
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db


def get_exist_list():
    """Query FilmDB for its imdb_filmID column and return the values as a list."""
    cursor.execute('SELECT imdb_filmID FROM FilmDB')
    film_ids = []
    for row in cursor:
        film_ids.append(row[0])
    return film_ids


# Tag every title on the IMDb Top 250 chart: films already stored in FilmDB
# get filmType 'Top250', the others are crawled with the 'Top250' film type.
exist_films = get_exist_list()
soup = page_read.page_read_nolog(
    'http://www.imdb.com/chart/top/?ref_=nv_mv_250_6')

for column in soup.select('.titleColumn'):
    link = column.a.get('href').strip()
    # The film id is the path segment that follows 'title/'.
    film_id = link.split('title/')[1].split('/')[0]
    if film_id not in exist_films:
        moviescrawler.crawl_imdb(film_id, 'Top250')
        db.commit()
        continue
    cursor.execute(
        '''UPDATE FilmDB SET filmType = 'Top250' WHERE imdb_filmID=%s''',
        (film_id, ))
    print(film_id)
    db.commit()