from film_update import moviescrawler
from crawler_util import page_read

# Walk the IMDb feature-film search listing page by page (pages 1-99) and
# hand every film id found in the result headers to the movie crawler under
# the 'Normal' category.  Pages that fail to load (falsy soup) are skipped.
the_url = 'http://www.imdb.com/search/title?count=100' \
          '&release_date=1915-01-01,2017-06-30&title_type=feature&page='
imdb_href = 'http://www.imdb.com'
for page_no in range(1, 100):
    soup = page_read.page_read_power(the_url + str(page_no))
    if not soup:
        continue
    for header in soup.select('.lister-item-header'):
        href = header.a.get('href')
        print(str(page_no) + ': ' + imdb_href + href)
        # href looks like '/title/tt1234567/...'; pull out the ttXXXXXXX id.
        the_filmid = href.split('title/')[1].split('/')[0]
        if the_filmid:
            moviescrawler.crawl_imdb(the_filmid, 'Normal')
# Exemplo n.º 2
# 0
              Oscar BOOL,
              budget INT,
              gross INT,
              worldwideGross INT,
              linear_predict INT,
              linear_test DOUBLE,
              lasso_predict INT,
              lasso_test DOUBLE,
              knn_predict INT,
              knn_test DOUBLE,
              poly_predict INT,
              poly_test DOUBLE,
              UNIQUE(imdb_filmID) 
              )DEFAULT CHARSET = utf8'''

# Finish the schema setup (create_film is the CREATE TABLE statement built
# above), empty the UpdateFilm staging table, then crawl the three IMDb
# front-page widgets -- This Week / Latest / Coming -- marking each crawled
# film as needing an update.
cursor.execute(create_film)
cursor.execute('DELETE FROM UpdateFilm')
db.commit()
db.close()

imdb_href = 'http://www.imdb.com/'
soup = page_read.page_read_power(imdb_href)

film_type = ('ThisWeek', 'Latest', 'Coming')

# The i-th widget on the front page corresponds to the i-th category label.
widgets = soup.select('.aux-content-widget-2')
for idx, category in enumerate(film_type):
    for entry in widgets[idx].select('.title'):
        # href looks like '/title/tt1234567/?ref=...'; extract the tt id.
        film_id = entry.a['href'].split('title/')[1].split('?')[0]
        moviescrawler.crawl_imdb(film_id, category, need_update=True)
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db

# Crawl the IMDb Top-250 chart and ingest every listed film under the
# 'Top250' category, committing the DB work at the end.
# (The unused "i = 0" counter that preceded this block was removed.)
soup = page_read.page_read_nolog(
    'http://www.imdb.com/chart/top/?ref_=nv_mv_250_6')
for item in soup.select('.titleColumn'):
    ref = item.a.get('href').strip()
    # ref looks like '/title/tt1234567/...'; pull out the ttXXXXXXX id.
    film_id = ref.split('title/')[1].split('/')[0]
    # BUG FIX: the third argument used to be the *string* 'False', which is
    # truthy and therefore behaved like need_update=True (compare the
    # sibling call sites that pass need_update=True or omit the flag).
    # Pass the boolean explicitly.
    moviescrawler.crawl_imdb(film_id, 'Top250', need_update=False)
db.commit()
from film_update import moviescrawler
from db_helper.save import cursor, db


def get_exist_list():
    """Return the first column of every row pending on the module-level
    cursor as a plain list (the caller has already executed the SELECT)."""
    return [row[0] for row in cursor.fetchall()]


# Tag Best Picture winners: films already present in FilmDB get Oscar = 1,
# films not yet in the table are crawled under the 'Oscar' category.
cursor.execute('SELECT imdb_filmID FROM FilmDB')
exist_films = get_exist_list()
soup = page_read.page_read_nolog(
    'http://www.imdb.com/search/title?count=100&'
    'groups=oscar_best_picture_winners&title_type=feature&sort=release_date,desc'
)
imdb_href = 'http://www.imdb.com'
for header in soup.select('.lister-item-header'):
    # href looks like '/title/tt1234567/...'; extract the ttXXXXXXX id.
    the_filmid = header.a.get('href').split('title/')[1].split('/')[0]
    if the_filmid not in exist_films:
        print(the_filmid)
        moviescrawler.crawl_imdb(the_filmid, 'Oscar')
        continue
    cursor.execute('''UPDATE FilmDB SET Oscar = 1 WHERE imdb_filmID=%s''',
                   (the_filmid, ))
    db.commit()
db.commit()
from crawler_util import page_read
from film_update import moviescrawler
from db_helper.save import cursor, db


def get_exist_list():
    """Return every imdb_filmID currently stored in the FilmDB table."""
    cursor.execute('SELECT imdb_filmID FROM FilmDB')
    # Each row is a 1-tuple; unpack it directly while iterating the cursor.
    return [film_id for (film_id,) in cursor]


# Re-tag Top-250 films: films already in FilmDB have their filmType updated
# in place; films not yet stored are crawled under the 'Top250' category.
exist_films = get_exist_list()
soup = page_read.page_read_nolog(
    'http://www.imdb.com/chart/top/?ref_=nv_mv_250_6')
for entry in soup.select('.titleColumn'):
    # href looks like '/title/tt1234567/...'; extract the ttXXXXXXX id.
    film_id = entry.a.get('href').strip().split('title/')[1].split('/')[0]

    if film_id not in exist_films:
        moviescrawler.crawl_imdb(film_id, 'Top250')
        continue
    cursor.execute(
        '''UPDATE FilmDB SET filmType = 'Top250' WHERE imdb_filmID=%s''',
        (film_id, ))
    print(film_id)
    db.commit()
db.commit()