def imdb_import(number):
    """Import up to *number* movies from the IMDB top-250 as sample data.

    Resets the database, then for each title creates a Movie (with director,
    cast, and 0-2 MovieCopy rows). *number* may be an int or a numeric string.

    Returns a dict with the requested count, the kind, and the created Movie
    objects. NOTE(review): 'number_imported' echoes the request even when the
    top-250 list is shorter — kept for interface compatibility.
    """
    reset_database()
    imdb = Imdb(cache=True)
    top = imdb.top_250()
    movies = []
    # Hoisted: the original re-ran int(number) on every loop iteration.
    # max(..., 0) keeps negative requests importing nothing (as before)
    # instead of slicing from the wrong end.
    limit = max(int(number), 0)
    for entry in top[:limit]:
        m = Movie()
        im = imdb.get_title_by_id(entry['tconst'])
        m.name = im.title
        m.year = im.year
        m.imdb_id = im.imdb_id
        m.save()  # must exist in the DB before the m2m 'actors' can be used
        movies.append(m)
        # Attach director and cast from the IMDB credits list.
        for person in im.credits:
            if person.token == "directors":
                m.director = Person.objects.create_or_find_imdb(person)
            elif person.token == "cast":
                m.actors.add(Person.objects.create_or_find_imdb(person))
        m.save()
        # Create a random number (0-2) of physical copies.
        for _ in range(random.randrange(3)):
            mc = MovieCopy()
            mc.movie = m
            mc.save()
    # imdb.get_title_images("tt0468569")
    # imdb.get_person_images("nm0000033")
    return {
        'number_imported': number,
        'kind': 'movies',
        'movies': movies,
    }
def save_top_posters(cls):
    """Replace every stored TopPoster row with the current IMDB top 250."""
    client = Imdb()
    ranked = client.top_250()
    # Clear the table so the import below is a full refresh, not a merge.
    TopPoster.objects.all().delete()
    for entry in ranked:
        title = entry['title']
        TopPoster.objects.create(
            title=title,
            url=entry['image']['url'],
            num_votes=entry['num_votes'],
        )
        print('Saved poster for ', title)
class IMDB(Miner):
    """Miner implementation backed by the imdbpie client."""

    def __init__(self):
        self.handle = Imdb()
        super(IMDB, self).__init__()

    def top_list(self, number):
        # NOTE(review): imdbpie only exposes the fixed top-250 list;
        # `number` is accepted for the Miner interface but not used.
        return self.handle.top_250()

    def get_movie_id(self, index):
        # IMDB title ids are the raw index prefixed with "tt".
        return "tt" + index

    def get_movie_by_id(self, movie_id):
        """Return (images, title details) for the given IMDB id."""
        images = self.handle.get_title_images(movie_id)
        details = self.handle.get_title(movie_id)
        return images, details
def main():
    """Entry point: quantize top-250 IMDB posters into k color clusters.

    Downloads the posters on the first run (when the 'posters' directory is
    empty), then produces quantized posters and color bars for each poster,
    skipping the work when both output directories already contain results.
    """
    parser = argparse.ArgumentParser()
    # Fixed user-visible typo: "cluters" -> "clusters".
    parser.add_argument("-k", "--clusters", required=True, type=int,
                        help="Number of clusters")
    args = vars(parser.parse_args())
    k = args["clusters"]
    make_output_dirs(k)
    # Only hit the network when no posters have been downloaded yet.
    if not os.listdir("posters"):
        imdb = Imdb(anonymize=True)
        top = imdb.top_250()
        write_posters(top)
    qp_dir = "quantized_posters_" + str(k) + "/"
    cb_dir = "color_bars_" + str(k) + "/"
    if not os.listdir(qp_dir) and not os.listdir(cb_dir):
        for poster in os.listdir("posters"):
            process_poster("posters/" + poster, k)
# Python 2 script (uses dict.iteritems): caches IMDB top-250 cast lists,
# apparently intended for Redis storage (the pipeline is created but the
# execute call below is commented out).
import redis
from imdb import IMDb

r = redis.StrictRedis(host='localhost', port=6379, db=0)
pipe = r.pipeline()

from imdbpie import Imdb

imdb1 = Imdb()
imdb1 = Imdb(anonymize=True)  # to proxy requests
# Creating an instance with caching enabled
# Note that the cached responses expire every 2 hours or so.
# The API response itself dictates the expiry time)
imdb1 = Imdb(cache=True)

top250 = imdb1.top_250()
dict_top250 = {}
for i in range(len(top250)):
    # Key: the tconst bytes with the leading "tt" stripped (slice from 2);
    # value: the UTF-8 encoded title. The len(...) upper bound is just the
    # full length, i.e. equivalent to [2:].
    dict_top250[((top250[i]['tconst'].encode('utf-8'))[2:len(top250[i]['tconst'].encode('utf-8'))])] = (top250[i]['title'].encode('utf-8'))

for movId, title in dict_top250.iteritems():
    cast_list = []
    cast_dict = {}
    # print movId
    # Second client (IMDbPY) resolves the numeric movie id to a Movie object.
    imdb2 = IMDb()
    my = imdb2.get_movie(movId)
    # pipe.execute()
    # Collect the person ids of the top-billed 10 cast members.
    for castMember in my['cast'][0:10]:
        # cast_dict[castMember['name'].encode('utf-8')] = castMember.getID();
        cast_list.append(castMember.getID());
class Quiz:
    """Movie-guessing quiz for a chat session: picks a random IMDB movie,
    posts one of its trailer images, and awards points for correct titles."""

    # Class-level defaults; `imdb` and `movie` are re-bound per instance.
    movies_type = ''
    imdb = ''
    movie = None

    def __init__(self, session):
        # `session` is expected to expose .messenger, .players, .status,
        # .chat_id and the update_* helpers used below — defined elsewhere.
        self.session = session
        self.imdb = Imdb()
        self.imdb = Imdb(cache=True)

    def set_level(self, level):
        # Placeholder: difficulty levels are not implemented.
        pass

    def rand_movie(self, rand_type=None):
        """Keep drawing until a movie with at least one trailer image is found.

        rand_type "pop" draws from the top-250; None draws a random 7-digit
        IMDB id (most of which will not resolve, hence the retry loop).
        """
        movie_id = ''
        while self.movie is None:
            if rand_type == "pop":
                pop_movies = self.imdb.top_250()
                number = randrange(0, len(pop_movies) - 1)
                movie_id = pop_movies[number]['tconst']
            if rand_type is None:
                number = str(randrange(1, 99999))
                if len(number) < 7:
                    # Zero-pad to the 7-digit IMDB id width.
                    number = '0' * (7 - len(number)) + number
                movie_id = "tt"+number  # formatting to IMDB_ID
            self.movie = self.imdb.get_title_by_id(movie_id)
            if self.movie is not None:
                # Reject movies that have no trailer image to show.
                if len(self.movie.trailer_image_urls) < 1:
                    self.movie = None

    def get_movie_photo(self):
        """Return a random trailer image URL for the current movie."""
        try:
            return choice(self.movie.trailer_image_urls)
        except ValueError as e:
            raise e

    def get_question(self, rand_type=None):
        """Pick a movie and return an image URL for the question."""
        try:
            self.rand_movie(rand_type)
            return self.get_movie_photo()
        except ValueError as e:
            # NOTE(review): raising the result of _() raises a non-exception
            # object — TypeError on Python 3; probably meant to wrap it in an
            # Exception subclass. Left as-is.
            raise(_("not_possible_find_movie"))

    def show(self, update, rand_type):
        """Post a new question image to the chat and mark the quiz running."""
        chat_id = update.message.chat_id
        movie_img = self.get_question(rand_type)
        self.session.messenger.send_msg(chat_id, "CINEMONSTER", "title")
        self.session.messenger.send_photo(chat_id, movie_img, caption=_("question_which_movie"))
        self.session.update_counter()
        self.session.status = "running"

    def check_resps(self, update):
        """Check a chat answer; on an exact (case-insensitive) title match,
        credit the player one point and clear the current movie."""
        chat_id = update.message.chat_id
        if str.lower(self.movie.title) == str.lower(update.message.text):
            player = Player(update.message.from_user.id)
            player.name = update.message.from_user.first_name+" "+update.message.from_user.last_name
            try:
                self.session.player_add(player)
            except ValueError as e:
                # Player already registered in this session — ignore.
                pass
            self.session.players[update.message.from_user.id].add_points(1)
            self.session.messenger.send_msg(chat_id, msg=(player.name, _("correct_answer")), type_msg='bold')
            self.movie = None

    def check_expiration(self):
        """Advance the session timer; on timeout reveal the answer and stop."""
        try:
            self.session.update_timer()
        except ValueError as e:
            pass
        if self.session.status == "timed_out":
            self.session.messenger.send_msg(chat_id=self.session.chat_id, msg=(_("times_up"), self.movie.title), type_msg='bold')
            self.session.status = "stop"
            self.movie = None
# Script: collects rating/title/id/votes/year columns for the IMDB top 250,
# apparently in preparation for review sentiment scoring via AlchemyAPI.
from alchemyapi import AlchemyAPI
alchemyapi=AlchemyAPI()
from imdbpie import Imdb

imdb = Imdb()
imdb = Imdb(anonymize=True)  # to proxy requests
# Creating an instance with caching enabled
# Note that the cached responses expire every 2 hours or so.
# The API response itself dictates the expiry time)
imdb = Imdb(cache=True)

top_mov = imdb.top_250()
# Parallel column lists, one entry per top-250 movie.
# NOTE(review): `id` shadows the builtin of the same name.
rating = []
title = []
id = []
votes = []
prod_year = []
for i in range(len(top_mov)):
    rating.append(top_mov[i]['rating'])
    title.append(top_mov[i]['title'])
    id.append(top_mov[i]['tconst'])
    votes.append(top_mov[i]['num_votes'])
    prod_year.append(top_mov[i]['year'])
#print rating

reviews={}
reviewScore={}
num = 15
# Process movies ranked 202-250 only (slice of the id list).
# NOTE(review): the loop body is truncated in this chunk.
for item in id[201:250]:
# Script: demo of imdbpie — prints details of one known title, then 15
# randomly sampled entries from the top-250. psycopg2 connection code is
# stubbed out (commented).
import psycopg2
from imdbpie import Imdb
import random

imdb = Imdb()
imdb = Imdb(anonymize=True)  # NOTE(review): overwrites the instance above

variable = imdb.search_for_title("The Dark Knight")[0]
# conn = psycopg2.connect()
# cur = conn.cursor()

# Fixed reference title: The Dark Knight.
title = imdb.get_title_by_id("tt0468569")
print (title.title)
print (title.rating)
print (title.runtime)

x = 0
listOfPopularMovies = imdb.top_250()
# Print 15 random picks (indexes 1-249 — index 0 is never sampled).
while x<15:
    temp = random.randint(1, 249)
    t = listOfPopularMovies[temp]
    tid = t["tconst"]
    print (tid)
    print (t["title"] + " is the " + str(temp) +"th rated movie")
    print ("It's score is: " + str(t["rating"]))
    x = x + 1
# Python 2 script (uses dict.iteritems): near-duplicate of the redis/IMDb
# cast collector above; truncated — the inner loop body below ends at a
# commented-out line.
import redis
from imdb import IMDb

r = redis.StrictRedis(host='localhost', port=6379, db=0)
pipe = r.pipeline()

from imdbpie import Imdb

imdb1 = Imdb()
imdb1 = Imdb(anonymize=True)  # to proxy requests
# Creating an instance with caching enabled
# Note that the cached responses expire every 2 hours or so.
# The API response itself dictates the expiry time)
imdb1 = Imdb(cache=True)

top250 = imdb1.top_250()
dict_top250 = {}
for i in range(len(top250)):
    # Key: tconst bytes minus the "tt" prefix; value: encoded title.
    dict_top250[((top250[i]['tconst'].encode('utf-8'))[2:len(top250[i]['tconst'].encode('utf-8'))])] = (top250[i]['title'].encode('utf-8'))

for movId, title in dict_top250.iteritems():
    cast_list = []
    cast_dict = {}
    # print movId
    imdb2 = IMDb()
    my = imdb2.get_movie(movId)
    # pipe.execute()
    # NOTE(review): loop body truncated in this chunk — only the commented
    # line survives, which leaves this for-statement without a body.
    for castMember in my['cast'][0:10]:
        # cast_dict[castMember['name'].encode('utf-8')] = castMember.getID();
    # NOTE(review): fragment — the enclosing function header (presumably
    # `def single_quote(s):`, an SQL single-quote escaper) is outside this
    # chunk; indentation below is reconstructed.
    return 'None'
    # Double every single quote in s (SQL-style escaping); the [2:] strips
    # the two leading quotes added before the first split segment.
    if s.find('\'') != -1:
        ss = s.split("\'")
        new = ''
        for x in ss:
            new = new + "\'" + "\'" + x
        return new[2:]
    else:
        return s


imdb = Imdb()
imdb = Imdb(anonymize=True)  # to proxy requests

top250 = []
top250 = imdb.top_250()
# Build INSERT statements for each top-250 movie, including its first
# trailer URL when available.
for item in top250:
    try:
        title = imdb.get_title_by_id(item['tconst'])
        if len(title.trailers) > 0:
            trailer_url = title.trailers[0]['url']
        else:
            trailer_url = 'None'
        # NOTE(review): statement truncated in this chunk — the format()
        # argument list is cut off below.
        new_movie = (
            '''INSERT INTO movie_movie VALUES (\'{}\',\'{}\',\'{}\',\'{}\',\'{}\',\'{}\',\'{}\',\'{}\')'''
            .format(
                item['tconst'],
                single_quote(str(item['title'])),
                item['year'],
                title.release_date,
                item['rating'],
# Script: builds a pandas DataFrame of detailed metadata for every top-250
# movie, with a Halo spinner reporting progress. Truncated at the end.
from imdbpie import Imdb
import pandas as pd
from halo import Halo

imdb = Imdb(anonymize=True)
movies = imdb.top_250()

cols = [
    "Title", "Actors", "Director", "Genres", "Rating",
    "Running Time", "Year", "Certification", "Writers"
]
df = pd.DataFrame(columns=cols)

spinner = Halo(text='Loading', spinner='dots')
spinner.start()
for j, el in enumerate(movies):
    # One extra API call per movie to fetch the full title record.
    movie = imdb.get_title_by_id(el["tconst"])
    title = movie.title
    actors = ', '.join(i.name for i in movie.cast_summary)
    director = movie.directors_summary[0].name
    genres = ', '.join(i for i in movie.genres)
    rating = movie.rating
    rtime = movie.runtime
    year = movie.year
    cert = movie.certification
    writers = ', '.join(i.name for i in movie.writers_summary)
    # 250 movies -> (j+1)/2.5 gives percent complete.
    spinner.text = "Running - " + str((j + 1) / 2.5) + "%"
    # NOTE(review): row-assignment truncated in this chunk.
    df.loc[j] = [
class ImdbNewly:
    """Tracks which titles entered the IMDB top-250 since the last snapshot.

    On construction it loads the previously stored top-250 (creating the
    snapshot file on first run), fetches the current list, and computes the
    set of newly added title ids.
    """

    _imdb = None
    _newTop = None
    _oldTop = None
    _newlyAdded = None
    # Path of the on-disk JSON snapshot of the previous top-250 fetch.
    _storedTopFile = "top.json"

    def __init__(self):
        self._imdb = Imdb()
        self._oldTop = self._get_stored_data()
        self._oldTopList = self._generate_oldTop_id_list()
        self._newTop = self._fetch_data()
        self._newTopList = self._generate_newTop_id_list()
        self._newlyAdded = self._find_newly_added()

    def _fetch_data(self):
        """Fetch the current top-250 plus a human-readable fetch timestamp."""
        today = datetime.datetime.now()
        return {
            "top": self._imdb.top_250(),
            "info": {"date": today.ctime()},
        }

    def save_top_data(self):
        """Persist a freshly fetched top-250 snapshot to the store file."""
        top = self._fetch_data()
        # with-statement guarantees the handle is closed even if the write
        # raises (the original open/write/close leaked it on errors).
        with open(self._storedTopFile, 'w') as f:
            json.dump(top, f)

    def _generate_oldTop_id_list(self):
        """tconst ids of the stored (old) snapshot, in list order."""
        return [item["tconst"] for item in self._oldTop["top"]]

    def _generate_newTop_id_list(self):
        """tconst ids of the freshly fetched (new) list, in list order."""
        return [item["tconst"] for item in self._newTop["top"]]

    def _get_stored_data(self):
        """Load the stored snapshot, creating it first if it does not exist."""
        if not os.path.isfile(self._storedTopFile):
            self.save_top_data()
        with open(self._storedTopFile, 'r') as f:
            return json.load(f)

    def _search_newTop_data(self, title_id):
        """Return the new-list entry with the given tconst, or None."""
        # Parameter renamed from `id` to stop shadowing the builtin.
        return next((item for item in self._newTop["top"]
                     if item["tconst"] == title_id), None)

    def _find_newly_added(self):
        """Ids present in the new list but absent from the old one."""
        return set(self._newTopList) - set(self._oldTopList)

    def get_newly_added(self):
        """Full entries for every newly added title."""
        return [self._search_newTop_data(itemId) for itemId in self._newlyAdded]

    def get_newTop_date(self):
        return self._newTop["info"]["date"]

    def get_oldTop_date(self):
        return self._oldTop["info"]["date"]
# Jupyter-notebook export (contains an IPython %matplotlib magic, so this is
# not directly runnable as a plain .py file). Loads the IMDB top-250 into a
# DataFrame and trims it to the top 100 by rating. Truncated at the end.
import psycopg2
from sqlalchemy import create_engine
import requests
from imdbpie import Imdb
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#########################################
# part 1
# importing top 250 movies from imdb database using api thang into a dataframe
# NOTE(review): `pd` is used below but not imported in this chunk —
# presumably imported elsewhere in the notebook.
imdb = Imdb()
imdb = Imdb(anonymize=True)
top_250 = pd.DataFrame(imdb.top_250())
# sorting values by rating and selecting only the top 100 movies
top_250 = top_250.sort_values(by='rating', ascending=False)
top_100 = top_250[0:100]
# limiting columns according to starter code
mask = ['num_votes', 'rating', 'tconst', 'title', 'year']
top_100 = top_100[mask]
# getting genre/runtime from OMDB top_100
movie_list = top_100['tconst']

# NOTE(review): function body truncated in this chunk.
def get_genre_runtime(b):
    genres = []
# Jupyter-notebook export (IPython %matplotlib magic present). Pulls the
# IMDB top-250 into a DataFrame, prunes columns, keeps the top 100, and
# renames tconst to movie_id.
import urllib
from bs4 import BeautifulSoup
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#1. Connect to the imdbpie API
# NOTE(review): `Imdb` and `pd` are used below but not imported in this
# chunk — presumably imported in an earlier notebook cell.
imdb = Imdb()
imdb = Imdb(anonymize = True)

#2. Query the top 250 rated movies in the database
imdb.top_250()

#3. Put the information into a dataframe, then keep only relevant columns
data = pd.DataFrame(imdb.top_250())
data.head()
data.drop('can_rate', axis=1, inplace=True)
data.drop('image', axis=1, inplace=True)
data.drop('type', axis=1, inplace=True)

#4. Select only the top 100 movies
data = data.iloc[0:100]
#change the column name tconst to movie_id
data.rename(columns={'tconst': 'movie_id'}, inplace=True)
data.head()
# Script: near-duplicate of the AlchemyAPI review collector above — builds
# parallel column lists for the top 250 and initializes review containers
# for movies ranked 202-250. The trailing loop's body likely continues
# beyond this chunk.
from alchemyapi import AlchemyAPI
alchemyapi = AlchemyAPI()
from imdbpie import Imdb

imdb = Imdb()
imdb = Imdb(anonymize=True)  # to proxy requests
# Creating an instance with caching enabled
# Note that the cached responses expire every 2 hours or so.
# The API response itself dictates the expiry time)
imdb = Imdb(cache=True)

top_mov = imdb.top_250()
# Parallel column lists, one entry per top-250 movie.
# NOTE(review): `id` shadows the builtin of the same name.
rating = []
title = []
id = []
votes = []
prod_year = []
for i in range(len(top_mov)):
    rating.append(top_mov[i]['rating'])
    title.append(top_mov[i]['title'])
    id.append(top_mov[i]['tconst'])
    votes.append(top_mov[i]['num_votes'])
    prod_year.append(top_mov[i]['year'])
#print rating

reviews = {}
reviewScore = {}
num = 15
for item in id[201:250]:
    reviews[item] = []
    # NOTE(review): fragment — duplicate of the SQL single-quote escaper
    # tail above; the enclosing `def` header is outside this chunk and the
    # indentation below is reconstructed.
    return 'None'
    # Double every single quote (SQL escaping); [2:] strips the leading pair.
    if s.find('\'') != -1:
        ss = s.split("\'")
        new = ''
        for x in ss:
            new = new + "\'" + "\'" + x
        return new[2:]
    else:
        return s


imdb = Imdb()
imdb = Imdb(anonymize=True)  # to proxy requests

top250 = []
top250 = imdb.top_250()
# Build INSERT statements for each top-250 movie.
for item in top250:
    try:
        title = imdb.get_title_by_id(item['tconst'])
        if len(title.trailers) > 0:
            trailer_url = title.trailers[0]['url']
        else:
            trailer_url = 'None'
        # NOTE(review): statement truncated in this chunk — the format()
        # argument list is cut off below.
        new_movie = (
            '''INSERT INTO movie_movie VALUES (\'{}\',\'{}\',\'{}\',\'{}\',\'{}\',\'{}\',\'{}\',\'{}\')'''.format(
                item['tconst'],
                single_quote(str(item['title'])),
                item['year'],
                title.release_date,
                item['rating'],
                single_quote(item['image']['url']),
# - Name - first and last name # - Known for - url to a movie that they are best known for # - Birth date (month/year/day) # # ##### Additional movie data to be retrieved with the API # Using the "tconst" field returned in the Topp 250 list to retrieve data on individual movies. Data fields that will be retrieved for each movie include: # * # # Each dataset will be initially loaded into Pandas dataframes and then saved as Postgres tables for analysis # ##### Load the Top 250 Movies of all time into dataframe 'top_250' and drop unwanted columns # In[ ]: imdb = Imdb() imdb_top = imdb.top_250() #imdb.search_for_title("The Dark Knight") imdb_top top_250 = pd.DataFrame(imdb_top, columns=['can_rate', 'image', 'num_votes', 'rating', 'tconst', 'title', 'type', 'year']) top_250.drop(['can_rate', 'image', 'title', 'type'],inplace=True,axis=1) # ##### Import the Top 100 Actors and drop unwanted columns # In[ ]: top_actors = pd.read_csv("top_100_actors.csv") top_actors.drop(['created', 'modified'],inplace=True,axis=1) # ##### Pull selected movie information and add columns to top_250 dataframe
# Minimal script: fetch and print the IMDB top-250 list.
from imdbpie import Imdb

# anonymize=True routes requests through a proxy. The original also created
# a bare Imdb() first and immediately overwrote it — dead code, removed.
imdb = Imdb(anonymize=True)
var1 = imdb.top_250()
print(var1)
    # NOTE(review): Python 2 fragment (print statements) — the enclosing
    # `def get_movie(...)` header is outside this chunk; indentation of the
    # function tail below is reconstructed. Builds an XML element for one
    # movie, then dispatches on the command-line argument.
    etree.SubElement(movie_xml, "genre").text = str(movie.genres[0])
    etree.SubElement(movie_xml, "title").text = movie.title
    etree.SubElement(movie_xml, "year").text = str(movie.year)
    etree.SubElement(movie_xml, "description").text = movie.plot_outline
    # Price is randomized between 1 and 8 (inclusive).
    etree.SubElement(movie_xml, "price").text = str(random.randint(1, 8))
    return movie_xml


if len(sys.argv) < 2:
    print 'usage: imdb_downloader.py [movie name | top50 | top250]'
    sys.exit(1)

if sys.argv[1] == "top50":
    print "Retrieving Top 50 movies"
    top50 = imdb.top_250()[0:50]
    for m in top50:
        movies_xml.getroot().append(get_movie(m['tconst']))
elif sys.argv[1] == "top250":
    print "Retrieving Top 250 movies"
    # NOTE(review): variable is named top50 but holds all 250 entries here.
    top50 = imdb.top_250()
    for m in top50:
        movies_xml.getroot().append(get_movie(m['tconst']))
else:
    # Fall through: treat the argument as a movie name to search for.
    # NOTE(review): truncated in this chunk after the None check header.
    movie = imdb.find_by_title(sys.argv[1])[0]
    if movie is None: