def full_cast(keyword, table_tag):
    """Scrape the full-credits cast table that follows the heading matching *keyword*.

    :param keyword: heading text located via index_finder.
    :param table_tag: list of heading tags to search.
    :returns: DataFrame with one row per cast entry (image, actor, character).
    """
    heading = table_tag[index_finder(table_tag, keyword)]
    rows = heading.findNext('table').select('tr')
    cast_df = pd.DataFrame(columns=[
        'Image', 'Name', 'Name_ID', 'Name_URI', 'Character_Name',
        'Character_ID', 'Character_URI'
    ])
    for row in rows:
        photo = row.select_one('td.primary_photo')
        char_cell = row.select_one('td.character')
        # catch() evaluates each lambda, falling back to the 'None' default
        # when a tag/attribute is missing on this row.
        cast_df.loc[len(cast_df)] = [
            catch('None', lambda: unicode(photo.a.img['src'])),
            catch('None', lambda: unicode(photo.a.img['title'])),
            # Name ID: slice of the /name/<id>/ href.
            catch('None', lambda: unicode(photo.a['href'][6:15])),
            catch('None',
                  lambda: unicode('%s%s' % (base_uri, photo.a['href'][1:]))),
            catch('None', lambda: characters_title(char_cell)),
            catch('None', lambda: characters_id(char_cell)),
            catch('None', lambda: characters_uri(char_cell))
        ]
    return dataframe_data(cast_df)
def critic_df(targeted_tag):
    """Build a DataFrame of critic reviews from metacritic-style markup.

    :param targeted_tag: iterable of review container tags.
    :returns: DataFrame with rating, publisher, author, link and summary.
    """
    df = pd.DataFrame(
        columns=['Rating Value', 'Publisher', 'Author', 'Publisher URI',
                 'Summary'])
    for review in targeted_tag:
        df.loc[len(df)] = [
            catch('None', lambda: unicode(
                review.select_one('span[itemprop="ratingValue"]').get_text())),
            # First itemprop="name" span is the publisher, second the author.
            catch('None', lambda: unicode(
                review.select('span[itemprop="name"]')[0].get_text())),
            catch('None', lambda: unicode(
                review.select('span[itemprop="name"]')[1].get_text())),
            catch('None', lambda: unicode(review.a['href'])),
            catch('None', lambda: unicode(
                review.select_one('div[class="summary"]').get_text()))
        ]
    return dataframe_data(df)
def cast_non_credit(keyword, table_tag):
    """Scrape an uncredited-cast table that follows the heading matching *keyword*.

    :param keyword: heading text located via index_finder.
    :param table_tag: list of heading tags to search.
    :returns: DataFrame with name/ID/URI per row, via the name/titleid/uri helpers.
    """
    heading = table_tag[index_finder(table_tag, keyword)]
    rows = heading.findNext('table').select('tr')
    cast_df = pd.DataFrame(columns=['Name', 'ID', 'URI'])
    for row in rows:
        cast_df.loc[len(cast_df)] = [name(row), titleid(row), uri(row)]
    return dataframe_data(cast_df)
def top_250(soup):
    """Scrape IMDb's Top Rated chart into a DataFrame.

    :param soup: BeautifulSoup of the chart page (expects a .lister-list table).
    :returns: DataFrame with one row per title: rank, name, ID, URI,
        director list, cast list, year, rating, votes, rating tooltip, poster.
    """
    targeted_tag = soup.select_one('.lister-list').select('tr')
    top_rated_movies_df = pd.DataFrame(columns=[
        'Rank', 'Name', 'ID', 'URI', 'Director', 'Cast', 'Year', 'Rating',
        'Votes', 'Rating_Stats', 'Poster'
    ])
    for title in targeted_tag:
        title_card = title.select_one("td.titleColumn")
        poster_card = title.select_one('.posterColumn')
        rating_card = title.select_one('.ratingColumn')
        # catch() evaluates each lambda and substitutes the 'None'/'list'
        # default when the expected tag or attribute is missing.
        top_rated_movies_df.loc[len(top_rated_movies_df)] = [
            # Rank: leading text node of the title cell, minus the trailing dot.
            catch('None', lambda: unicode(title_card.contents[0])[:-1]),
            catch('None', lambda: unicode(title_card.a.get_text())),
            # Title ID: slice of the /title/<id>/ href.
            catch('None', lambda: unicode(title_card.a['href'][7:-1])),
            catch(
                'None',
                lambda: unicode("%s%s" % (base_uri, title_card.a['href'][1:]))),
            # The anchor's title attribute reads "Director (dir.), Actor, ...";
            # entries containing ' (dir.)' are directors, the rest are cast.
            catch(
                'list', lambda: [
                    unicode(item.replace(' (dir.)', ''))
                    for item in title_card.a['title'].split(',')
                    if ' (dir.)' in item
                ]),
            catch(
                'list', lambda: [
                    unicode(item.replace(' (dir.)', ''))
                    for item in title_card.a['title'].split(',')
                    if ' (dir.)' not in item
                ]),
            # Year: last run of digits in the "(YYYY)" secondary-info span.
            catch(
                'None', lambda: int(
                    re.findall(
                        r"\d+",
                        unicode(
                            title_card.select_one('span.secondaryInfo').
                            get_text()))[-1])),
            catch('None',
                  lambda: float(unicode(rating_card.strong.get_text()))),
            # Votes: last number in the rating tooltip, thousands commas removed.
            catch(
                'None', lambda: int(
                    re.findall(
                        r"\d+",
                        unicode(rating_card.strong['title'].replace(',', '')))[
                            -1])),
            catch('None', lambda: unicode(rating_card.strong['title'])),
            catch('None',
                  lambda: unicode(poster_card.select_one('img')['src']))
        ]
    top_rated_movies_df = dataframe_data(top_rated_movies_df)
    return top_rated_movies_df
def external_site(keyword, soup):
    """Scrape the external-sites list that follows the tag selected by *keyword*.

    :param keyword: CSS selector for the heading preceding the <ul> of links.
    :param soup: BeautifulSoup of the page.
    :returns: DataFrame of site names and absolute URIs.
    """
    sites_df = pd.DataFrame(columns=['Name', 'URI'])
    for entry in soup.select_one(keyword).findNext('ul').select('li'):
        sites_df.loc[len(sites_df)] = [
            catch('None', lambda: unicode(entry.get_text())),
            # Relative href is joined onto base_uri (leading '/' dropped).
            catch('None',
                  lambda: unicode('%s%s' % (base_uri, entry.a['href'][1:])))
        ]
    return dataframe_data(sites_df)
def technical_specs(targeted_tag):
    """Build a Name/URI DataFrame from a collection of anchor-bearing tags.

    :param targeted_tag: iterable of tags each carrying text and an 'href'.
    :returns: DataFrame with one row per tag.
    """
    specs_df = pd.DataFrame(columns=['Name', 'URI'])
    for anchor in targeted_tag:
        specs_df.loc[len(specs_df)] = [
            catch('None', lambda: unicode(anchor.get_text())),
            # Relative href is joined onto base_uri (leading '/' dropped).
            catch('None',
                  lambda: unicode("%s%s" % (base_uri, anchor['href'][1:])))
        ]
    return dataframe_data(specs_df)
def review_df(analyser, targeted_tag):
    """Scrape user reviews and attach TextBlob/VADER-style sentiment scores.

    :param analyser: sentiment analyser exposing polarity_scores() (VADER-like).
    :param targeted_tag: iterable of review container tags.
    :returns: DataFrame with one row per review (text, rating, helpfulness,
        spoiler warning and sentiment fields).
    """
    user_reviews_df = pd.DataFrame(columns=[
        'Title', 'Title_URI', 'User_Name', 'User_URI', 'User_Reviews',
        'Review_Date', 'Rating', 'Rating_Scale', 'Review_Helpful', 'Out_of',
        'Review_Ation', 'Warning', 'Sentiment', 'Sentiment Score',
        'Polarity Scorce'
    ])
    for item in targeted_tag:
        title = catch('None', lambda: item.select_one('.title'))
        user_name = catch('None',
                          lambda: item.select_one('.display-name-link'))
        rating = catch('None',
                       lambda: item.select_one('.rating-other-user-rating'))
        # "X out of Y found this helpful" text; commas stripped so the
        # re.findall(r"\d+", ...) calls below parse whole numbers.
        votes = catch(
            'None', lambda: unicode(
                item.select_one('div.actions').contents[0].replace(',', '')))
        # Review body with apostrophes removed; reused by both analysers below.
        user_review = catch(
            'None', lambda: unicode(
                item.select_one('.text').get_text().replace("\'", "")))
        analysis = catch('None', lambda: TextBlob(user_review))
        user_reviews_df.loc[len(user_reviews_df)] = [
            catch('None', lambda: unicode(title.get_text())),
            catch('None',
                  lambda: "%s%s" % (base_uri, unicode(title['href'][1:]))),
            catch('None', lambda: unicode(user_name.get_text())),
            catch(
                'None',
                lambda: "%s%s" % (base_uri, unicode(user_name.a['href'][1:]))),
            user_review,
            catch('None',
                  lambda: unicode(item.select_one('.review-date').get_text())),
            catch('None', lambda: int(unicode(rating.span.get_text()))),
            # Rating scale, e.g. "/10" -> 10 (leading '/' sliced off).
            catch(
                'None', lambda: int(
                    unicode(
                        rating.select_one('span.point-scale').get_text()[1:]))
            ),
            # First and second numbers of the helpfulness sentence.
            catch('None', lambda: int(re.findall(r"\d+", votes)[0])),
            catch('None', lambda: int(re.findall(r"\d+", votes)[1])),
            votes,
            catch(
                'None', lambda: unicode(
                    item.select_one('.spoiler-warning').get_text())),
            catch('None', lambda: sentiment_textblob(analysis)),
            catch('None',
                  lambda: analyser.polarity_scores(user_review)['compound']),
            catch('None', lambda: analyser.polarity_scores(user_review))
        ]
    user_reviews_df = dataframe_data(user_reviews_df)
    return user_reviews_df
def company_data(keyword, soup):
    """Scrape a company-credits list that follows the tag selected by *keyword*.

    :param keyword: CSS selector for the heading preceding the <ul> of companies.
    :param soup: BeautifulSoup of the page.
    :returns: DataFrame of company name, ID and absolute URI.
    """
    company_df = pd.DataFrame(columns=['Name', 'ID', 'URI'])
    for entry in soup.select_one(keyword).findNext('ul').select('li'):
        company_df.loc[len(company_df)] = [
            catch('None', lambda: unicode(entry.a.get_text())),
            # Company ID: remainder of the /company/<id> href.
            catch('None', lambda: unicode(entry.a['href'][9:])),
            catch('None',
                  lambda: unicode('%s%s' % (base_uri, entry.a['href'][1:])))
        ]
    return dataframe_data(company_df)
def top_box_office(targeted_tag, box_office, date):
    """Scrape the weekend Top Box Office chart.

    :param targeted_tag: iterable of chart row tags.
    :param box_office: heading text whose last 4 characters are the year.
    :param date: sequence where date[0] is "Month StartDay" and date[1] the
        end day — used to build the Start_Week/End_Week labels.
    :returns: DataFrame with one row per title.
    """
    top_box_office_df = pd.DataFrame(columns=[
        'Name', 'ID', 'URI', 'Director', 'Cast', 'Weekend', 'Gross', 'Weeks',
        'Poster', 'Start_Week', 'End_Week'
    ])
    for title in targeted_tag:
        title_card = title.select_one("td.titleColumn")
        poster_card = title.select_one('.posterColumn')
        # catch() substitutes the 'None'/'list' default when a tag is missing.
        top_box_office_df.loc[len(top_box_office_df)] = [
            catch('None', lambda: unicode(title_card.a.get_text())),
            # Title ID: slice of the /title/<id>/ href.
            catch('None', lambda: unicode(title_card.a['href'][7:-1])),
            catch(
                'None',
                lambda: unicode("%s%s" % (base_uri, title_card.a['href'][1:]))),
            # Anchor title attribute reads "Director (dir.), Actor, ...".
            catch(
                'list', lambda: [
                    unicode(item.replace(' (dir.)', ''))
                    for item in title_card.a['title'].split(',')
                    if ' (dir.)' in item
                ]),
            catch(
                'list', lambda: [
                    unicode(item.replace(' (dir.)', ''))
                    for item in title_card.a['title'].split(',')
                    if ' (dir.)' not in item
                ]),
            # Weekend gross (ratingColumn doubles as the money column here).
            catch(
                'None',
                lambda: unicode(title.select_one('.ratingColumn').get_text())),
            catch(
                'None', lambda: unicode(
                    title.select_one('span.secondaryInfo').get_text())),
            catch(
                'None', lambda: unicode(
                    title.select_one('td.weeksColumn').get_text())),
            catch('None',
                  lambda: unicode(poster_card.select_one('img')['src'])),
            # Start of the weekend: "Month Day, Year" built from date[0] and
            # the trailing 4 chars (year) of the box-office heading.
            catch(
                'None', lambda: unicode("%s %s, %s" % (date[0].split()[
                    0], date[0].split()[1], box_office[-4:]))),
            # End of the weekend: same month, end day taken from date[1].
            catch(
                'None', lambda: unicode("%s %s, %s" % (date[0].split()[
                    0], date[1], box_office[-4:])))
        ]
    top_box_office_df = dataframe_data(top_box_office_df)
    return top_box_office_df
def rating_demo_region_df(targeted_tag):
    """Scrape the by-region rating-demographics table after *targeted_tag*.

    Column names come from the table's own header row; each data row yields
    three rating_demo() cells.
    """
    rows = targeted_tag.findNext('table').select('tr')
    header = [cell.text for cell in rows[0].select('th')]
    demo_df = pd.DataFrame(columns=header)
    for row in rows[1:]:
        cells = row.select('td[align="center"]')
        demo_df.loc[len(demo_df)] = [
            catch('None', lambda: rating_demo(cells[0])),
            catch('None', lambda: rating_demo(cells[1])),
            catch('None', lambda: rating_demo(cells[2]))
        ]
    return dataframe_data(demo_df)
def rating_demo_df(targeted_tag):
    """Scrape the rating-demographics table (all ages / age buckets).

    Column names come from the table's own header row; each data row yields
    a label cell plus five rating_demo() cells.
    """
    rows = targeted_tag.select('tr')
    header = [cell.text for cell in rows[0].select('th')]
    demo_df = pd.DataFrame(columns=header)
    for row in rows[1:]:
        cells = row.select('td[align="center"]')
        demo_df.loc[len(demo_df)] = [
            # Row label (e.g. gender) from the allText div.
            catch('None', lambda: row.select_one('div[class="allText"]').text),
            catch('None', lambda: rating_demo(cells[0])),
            catch('None', lambda: rating_demo(cells[1])),
            catch('None', lambda: rating_demo(cells[2])),
            catch('None', lambda: rating_demo(cells[3])),
            catch('None', lambda: rating_demo(cells[4]))
        ]
    return dataframe_data(demo_df)
def rating_df(targeted_tag):
    """Scrape the vote-distribution histogram table preceding *targeted_tag*.

    :returns: DataFrame with one row per rating value (scale, percentage, votes).
    """
    histogram_df = pd.DataFrame(
        columns=['Rating Scale', 'Percentage', 'Votes'])
    for row in targeted_tag.findPrevious('table').select('tr')[1:]:
        histogram_df.loc[len(histogram_df)] = [
            catch('None', lambda: unicode(
                row.select_one('div[class="rightAligned"]').get_text())),
            catch('None', lambda: unicode(
                row.select_one('div[class="topAligned"]').get_text())),
            catch('None', lambda: unicode(
                row.select_one('div[class="leftAligned"]').get_text()))
        ]
    return dataframe_data(histogram_df)
def trending_now_df(targeted_tag):
    """Scrape the "trending now" ranked list into a DataFrame.

    :param targeted_tag: iterable of trending-list item tags.
    :returns: DataFrame with rank, name, ID, URI and page-view share.
    """
    trending_df = pd.DataFrame(
        columns=['Rank', 'Name', 'ID', 'URI', '% OF TOP 10 PAGE VIEWS'])
    for entry in targeted_tag:
        name_tag = entry.select_one('.trending-list-rank-item-name')
        trending_df.loc[len(trending_df)] = [
            catch('None', lambda: unicode(entry.select_one(
                '.trending-list-rank-item-rank-position').get_text())),
            catch('None', lambda: unicode(name_tag.get_text())),
            # Title ID: fixed-width slice of the /title/<id>/ href.
            catch('None', lambda: unicode(name_tag.a['href'][7:16])),
            catch('None', lambda: unicode(
                "%s%s" % (base_uri, name_tag.a['href'][1:]))),
            catch('None', lambda: unicode(entry.select_one(
                '.trending-list-rank-item-share').get_text()))
        ]
    return dataframe_data(trending_df)
def __init__(self, title_id):
    """Fetch and parse the IMDb release-info page for *title_id*.

    Populates title/year attributes, a releases DataFrame, an
    also-known-as DataFrame, per-column lists, the India release date,
    and a combined metadata dict. Missing pieces fall back to None/[]
    via catch().
    """
    self.title_id = title_id
    self.release_info_url = imdb_uris["releaseinfo"] % self.title_id
    soup = BeautifulSoup(get(self.release_info_url).text, 'lxml')
    """ :returns: table tag index """
    table_tag = catch('None', lambda: soup.select('h4'))
    """ :returns: Movie Title """
    movie_tag = catch('None',
                      lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    # Year: first run of digits in the "(YYYY)" span.
    self.year = catch(
        'None', lambda: int(
            re.findall(r"\d+",
                       unicode(movie_tag.select_one('.nobr').get_text()))[
                           0]))
    """ returns: tags """
    releases = catch(
        'None', lambda: table_tag[index_finder(table_tag, 'release')].
        findNext('table').select('tr'))
    """ returns: Release Info DataFrame if available. """
    try:
        self.releases_df = pd.DataFrame(
            columns=['Country', 'URI', 'Date', 'Location'])
        for tag in releases:
            self.releases_df.loc[len(self.releases_df)] = [
                catch(
                    'None', lambda: unicode(
                        tag.select_one('td.release-date-item__country-name').
                        a.get_text())),
                catch(
                    'None', lambda: "%s%s" % (base_uri, unicode(
                        tag.select_one('td.release-date-item__country-name').
                        a['href'][1:]))),
                catch(
                    'None', lambda: unicode(
                        tag.select_one('td.release-date-item__date').
                        get_text())),
                catch(
                    'None', lambda: unicode(
                        tag.select_one('td.release-date-item__attributes').
                        get_text()))
            ]
        self.releases_df = dataframe_data(self.releases_df)
    except:
        # Any scraping failure (layout change, releases is None) yields None.
        self.releases_df = None
    """ :returns: Released Countries, Dates, Location list if available. """
    self.released_country_names = catch(
        'list', lambda: self.releases_df.Country.tolist())
    self.released_country_uri = catch(
        'list', lambda: self.releases_df.URI.tolist())
    self.released_dates = catch('list',
                                lambda: self.releases_df.Date.tolist())
    self.released_locations = catch(
        'list', lambda: self.releases_df.Location.tolist())
    """ :returns: Released Date in India if available. """
    self.release_date_in_india = catch(
        'None',
        lambda: unicode(releases[india_index_finder(releases, 'india')].
                        select_one('td').findNext('td').get_text()))
    """ returns: Also Known As DataFrame if available. """
    try:
        aka = table_tag[index_finder(
            table_tag, 'also known as')].findNext('table').select('tr')
        self.also_known_as_df = pd.DataFrame(columns=['Country', 'Title'])
        for tag in aka:
            self.also_known_as_df.loc[len(self.also_known_as_df)] = [
                catch(
                    'None', lambda: unicode(
                        tag.select_one('td.aka-item__name').get_text())),
                catch(
                    'None', lambda: unicode(
                        tag.select_one('td.aka-item__title').get_text()))
            ]
        self.also_known_as_df = dataframe_data(self.also_known_as_df)
    except:
        self.also_known_as_df = None
    """ :returns: Also Known As Countries, Title list if available. """
    self.also_known_as_country_names = catch(
        'list', lambda: self.also_known_as_df.Country.tolist())
    self.also_known_as_titles = catch(
        'list', lambda: self.also_known_as_df.Title.tolist())
    """ :returns: Creates Meta Data from the above info. if available. """
    self.imdb_release_info_metadata = catch(
        'dict', lambda: {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Release Info URL": self.release_info_url,
            "India Release Date": self.release_date_in_india,
            "Release Dates": {
                "Country": self.released_country_names,
                "URI": self.released_country_uri,
                "Date": self.released_dates,
                "Location": self.released_locations
            },
            "Also Known As (AKA)": {
                "Country": self.also_known_as_country_names,
                "Title": self.also_known_as_titles
            }
        })
def __init__(self):
    """Interactively pick release region(s) and scrape upcoming releases.

    Lists countries from the IMDb calendar sidebar, prompts on stdin for
    serial number(s), stores the chosen name(s)/code(s) on self, then
    builds ``self.upcoming_releases_df`` for the selected region (None on
    scraping failure). Exits the process if selection parsing fails.
    """
    """ returns: Country Name & Country Code """
    soup = BeautifulSoup(get(imdb_uris['calendar']).text, 'lxml')
    countries, country_code = [], []
    country = soup.select_one('#sidebar').select('a')
    try:
        # Fix: the original used country.index(item) inside the loop, which
        # is O(n) per iteration and returns the FIRST match — duplicate
        # country links would all print the same serial number, breaking
        # the user's menu. enumerate() numbers each entry correctly.
        for serial, item in enumerate(country, 1):
            print('%s : %s' % (serial, item.text.strip()))
            countries.append(item.text)
            # Country code: 2-letter region slice of the calendar href.
            country_code.append(item['href'][17:19].lower())
        input_name = re.findall(r"[\w']+", input('Enter serial number\t'))
        # Serial 0 maps to the empty (worldwide) region.
        countries = [
            countries[int(load) - 1] if int(load) != 0 else ''
            for load in input_name
        ]
        country_code = [
            country_code[int(load) - 1] if int(load) != 0 else ''
            for load in input_name
        ]
        if len(country_code) == 1:
            # Single selection: store scalars, not one-element lists.
            self.country_name = countries[0]
            self.country_code = country_code[0]
        else:
            self.country_name = countries
            self.country_code = country_code
    except Exception as es:
        print("{0} :".format(type(es)), es)
        sys.exit(0)
    """ returns: Upcoming Release for selected regions """
    self.region_url = imdb_uris['region'] % self.country_code
    region_soup = BeautifulSoup(get(self.region_url).text, 'lxml')
    try:
        release_dates = region_soup.select_one('#pagecontent').select('h4')
        self.upcoming_releases_df = pd.DataFrame(
            columns=['Release Date', 'Movie Title', 'ID', 'URI', 'Year'])
        for item in release_dates:
            movies = item.findNext('ul').select('a')
            years = item.findNext('ul').select('li')
            for movie, year_tag in zip(movies, years):
                self.upcoming_releases_df.loc[len(
                    self.upcoming_releases_df)] = [
                        catch('None', lambda: unicode(item.get_text())),
                        catch('None', lambda: unicode(movie.get_text())),
                        # Title ID: fixed-width slice of /title/<id>/.
                        catch('None', lambda: unicode(movie['href'][7:16])),
                        catch(
                            'None', lambda: "%s%s" %
                            (base_uri, unicode(movie['href'][1:]))),
                        # Year: last number in the list item's 3rd child node.
                        catch(
                            'None', lambda: int(
                                re.findall(r"\d+",
                                           unicode(year_tag.contents[2]))[
                                               -1]))
                    ]
        self.upcoming_releases_df = dataframe_data(
            self.upcoming_releases_df)
    except:
        self.upcoming_releases_df = None
def __init__(self, title_id):
    """Fetch and parse the IMDb parental-guide page for *title_id*.

    Populates title/year attributes, MPAA info, a certificates DataFrame,
    advisory severity/review attributes for each category, spoiler review
    lists, and a combined metadata dict. Missing pieces fall back to
    None/[]/{} via catch().
    """
    self.title_id = title_id
    self.parental_guide_url = imdb_uris["parentalguide"] % self.title_id
    soup = BeautifulSoup(get(self.parental_guide_url).text, 'lxml')
    """ :returns: Movie Title """
    movie_tag = catch('None',
                      lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    # Year: first run of digits in the "(YYYY)" span.
    self.year = catch(
        'None', lambda: int(
            re.findall(r"\d+",
                       unicode(movie_tag.select_one('.nobr').get_text()))[
                           0]))
    """ :returns: MPAA available. """
    mpaa = catch(
        'None', lambda: soup.select_one(tag_search['certificates']).
        select_one(tag_search['mpaa']))
    mpaa_tag = catch(
        'None',
        lambda: mpaa.select_one('td[class="ipl-zebra-list__label"]'))
    self.mpaa_name = catch('None', lambda: unicode(mpaa_tag.get_text()))
    self.mpaa_description = catch(
        'None', lambda: unicode(mpaa_tag.findNext('td').get_text()))
    """ :returns: Certificate DataFrame if available. """
    try:
        certificates = catch(
            'None', lambda: soup.select_one(tag_search['certificates']).
            select_one(tag_search['certifications']).find(
                'td', string='Certification').findNextSibling('td').select(
                    'li.ipl-inline-list__item'))
        self.certificates_df = pd.DataFrame(columns=['Name', 'URI'])
        for tag in certificates:
            self.certificates_df.loc[len(self.certificates_df)] = [
                catch('None', lambda: unicode(tag.a.get_text())),
                catch(
                    'None',
                    lambda: unicode("%s%s" % (base_uri, tag.a['href'][1:])))
            ]
        self.certificates_df = dataframe_data(self.certificates_df)
    except:
        # Any scraping failure (layout change, certificates is None) -> None.
        self.certificates_df = None
    self.certificates_name = catch(
        'list', lambda: self.certificates_df.Name.tolist())
    self.certificates_uri = catch(
        'list', lambda: self.certificates_df.URI.tolist())
    """ :returns: Adivsory Nudity status if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-nudity']))
    severity = catch('None',
                     lambda: advisory.select_one(tag_search['nudity']))
    self.adivsory_nudity_severity_status = catch(
        'dict', lambda: adivsory_satus(severity))
    self.advisory_nudity_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Adivsory Violence status if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-violence']))
    severity = catch('None',
                     lambda: advisory.select_one(tag_search['violence']))
    self.advisory_violence_severity_status = catch(
        'dict', lambda: adivsory_satus(severity))
    self.advisory_violence_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Adivsory Profanity status if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-profanity']))
    severity = catch('None',
                     lambda: advisory.select_one(tag_search['profanity']))
    self.advisory_profanity_severity_status = catch(
        'dict', lambda: adivsory_satus(severity))
    self.advisory_profanity_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Adivsory Alcohol status if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-alcohol']))
    severity = catch('None',
                     lambda: advisory.select_one(tag_search['alcohol']))
    self.advisory_alcohol_severity_status = catch(
        'dict', lambda: adivsory_satus(severity))
    self.advisory_alcohol_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Adivsory Frightening status if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-frightening']))
    severity = catch(
        'None', lambda: advisory.select_one(tag_search['frightening']))
    self.advisory_frightening_severity_status = catch(
        'dict', lambda: adivsory_satus(severity))
    self.advisory_frightening_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Spoilers Violence & Gore if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-spoilers']).
        select_one('section[id="advisory-spoiler-violence"]'))
    self.spoiler_violence_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Spoilers Alcohol, Drugs & Smoking if available. """
    # NOTE(review): alcohol spoiler reviews are read from the
    # 'advisory-spoiler-profanity' section id — verify against the live
    # page; 'advisory-spoiler-alcohol' may be the intended section.
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-spoilers']).
        select_one('section[id="advisory-spoiler-profanity"]'))
    self.spoiler_alcohol_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Spoilers Frightening & Intense Scenes if available. """
    advisory = catch(
        'None', lambda: soup.select_one(tag_search['advisory-spoilers']).
        select_one('section[id="advisory-spoiler-frightening"]'))
    self.spoiler_frightening_reviews = catch(
        'list', lambda: advisory_reviews(advisory))
    """ :returns: Creates Dict from the above info. if available. """
    self.imdb_parental_guide_metadata = catch(
        'dict', lambda: {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Parental Guide URL": self.parental_guide_url,
            "MPAA Name": self.mpaa_name,
            "MPAA Description": self.mpaa_description,
            "Certificate": self.certificates_name,
            "Certificate URI": self.certificates_uri,
            "Sex & Nudity": {
                "Nudity Severity": self.adivsory_nudity_severity_status,
                "Nudity Review": self.advisory_nudity_reviews
            },
            "Alcohol & Smoking": {
                "Alcohol Severity": self.advisory_alcohol_severity_status,
                "Alcohol Review": self.advisory_alcohol_reviews
            },
            "Violence": {
                "Violence Severity": self.advisory_violence_severity_status,
                "Violence Review": self.advisory_violence_reviews
            },
            "Frighten": {
                "Frighten Severity": self.
                advisory_frightening_severity_status,
                "Frighten Review": self.advisory_frightening_reviews
            },
            "Profanity": {
                "Profanity Severity": self.
                advisory_profanity_severity_status,
                "Profanity Review": self.advisory_profanity_reviews
            },
            "Spoiler Violence": self.spoiler_violence_reviews,
            "Spoiler Alcohol": self.spoiler_alcohol_reviews,
            "Spoiler Frighten": self.spoiler_frightening_reviews
        })