def mars_news(browser):
    """Scrape the first headline and teaser from the NASA Mars news page.

    Args:
        browser: Browser driver exposing ``visit``,
            ``is_element_present_by_css`` and ``html``.

    Returns:
        A ``(title, teaser)`` tuple of strings, or ``(None, None)`` when
        the expected elements are missing from the page.
    """
    browser.visit('https://mars.nasa.gov/news/')
    # Wait up to one second for the article list to show up.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    page = SOUP(browser.html, 'html.parser')
    try:
        first_slide = page.select_one("ul.item_list li.slide")
        headline = first_slide.find("div", class_='content_title').get_text()
        teaser = first_slide.find("div", class_="article_teaser_body").get_text()
    except AttributeError:
        # A lookup returned None, i.e. the page layout was not as expected.
        return None, None
    return headline, teaser
def main(emotion):
    """Fetch the IMDb title links associated with an emotion.

    Args:
        emotion: One of "Sad", "Anticipation", "Fear", "Enjoyment",
            "Trust", "Romantic" or "Comedy" (case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not one of the names above.
    """
    legacy_search = "http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc"
    genre_search = "https://www.imdb.com/search/title/?genres=%s"
    urls = {
        "Sad": legacy_search % "drama",
        "Anticipation": genre_search % "sci-fi",
        "Fear": legacy_search % "sport",
        "Enjoyment": legacy_search % "thriller",
        "Trust": legacy_search % "western",
        "Romantic": genre_search % "romance",
        "Comedy": genre_search % "comedy",
    }
    urlhere = urls.get(emotion)
    if urlhere is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r"\/title\/tt+\d*\/")})
def f4():
    """Print thriller titles scraped from IMDb and publish the first five.

    The titles are printed one per line, followed by two blank lines, and
    the first five are formatted into ``results['text']`` (``results`` is
    a module-level mapping defined outside this function).

    Fewer than five scraped titles used to raise IndexError while building
    the message; missing slots are now filled with empty strings.
    """
    print("ANTICIPATION MOVIES HAIN")
    urlhere = 'http://www.imdb.com/search/title?genres=thriller&title_type=feature&sort=moviemeter, asc'
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    title = soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})

    names = []
    # Only the first 13 matching anchors are examined, exactly as the
    # former manual counter (break once count > 11) did.
    for anchor in title[:13]:
        parts = str(anchor).split('>')
        # Three pieces means the tag has a single '>' before its closing
        # tag, i.e. plain text content; the slice drops the trailing '</a'.
        if len(parts) == 3:
            name = parts[1][:-3]
            print(name)
            names.append(name)
    print()
    print()

    # Pad to five entries so the format below never runs out of values.
    top_five = (names + [''] * 5)[:5]
    results['text'] = 'Anticipation Movies acc.to IMDB' + ':%s \n * %s \n * %s\n *%s\n *%s\n' % tuple(top_five)
def featured_image(browser):
    """Return the absolute URL of the featured JPL Mars image.

    Args:
        browser: Browser driver exposing ``visit``, ``find_by_id``,
            ``is_element_present_by_text``, ``links`` and ``html``.

    Returns:
        The image URL as a string, or ``None`` when the image element
        cannot be found on the detail page.
    """
    browser.visit('http://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')

    # Open the full-size view, then follow its "more info" link.
    browser.find_by_id('full_image').click()
    browser.is_element_present_by_text('more info', wait_time=1)
    browser.links.find_by_partial_text('more info').click()

    img_soup = SOUP(browser.html, 'html.parser')
    try:
        relative_src = img_soup.select_one('figure.lede a img').get("src")
    except AttributeError:
        # select_one returned None: no matching <img> on the page.
        return None

    # The src attribute is site-relative; prefix the host.
    return f'https://www.jpl.nasa.gov{relative_src}'
def main(emotion):
    """Fetch the IMDb title links associated with an emotion.

    Args:
        emotion: One of "Sad", "Excitement", "Disgust", "Anger", "Fear",
            "Enjoyment", "Trust", "Surprise", or the empty string (which
            selects the top-rated chart). Matching is case-sensitive.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    genre_search = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc'
    urls = {
        "Sad": 'https://www.imdb.com/list/ls052109630/',
        "Excitement": 'https://www.imdb.com/search/title/?count=100&genres=action&release_date=2019,2019&title_type=feature',
        "Disgust": genre_search % 'musical',
        "Anger": 'https://www.imdb.com/list/ls004108030/',
        "Fear": genre_search % 'sport',
        "Enjoyment": genre_search % 'thriller',
        # No emotion entered: fall back to the top-rated chart.
        "": 'https://www.imdb.com/chart/top?ref_=nv_mv_250',
        "Trust": genre_search % 'western',
        "Surprise": genre_search % 'film_noir',
    }
    urlhere = urls.get(emotion)
    if urlhere is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    # Download the page and collect the title links from the parsed HTML.
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def mars_hemispheres(browser):
    """Collect the title and sample-image link of each Mars hemisphere.

    Fixes over the previous version: the misspelled ``append`` call, the
    incomplete ``browser.visit(url + )`` statement (``browser.back()``
    already returns to the results page), the misspelled ``'Sample'`` link
    text, and the missing ``return`` of the collected list.

    Args:
        browser: Browser driver exposing ``visit``, ``find_by_tag``,
            ``html`` and ``back``.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one per hemisphere
        page on which both the title and the sample link were found.
    """
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_images = []
    # <img> tags 4-7 are the hemisphere thumbnails; the earlier ones were
    # found not to be the images of interest.
    for item in range(4, 8):
        # Thumbnails are looked up again on every pass, after navigating back.
        browser.find_by_tag('img')[item].click()

        hemis_soup = SOUP(browser.html, 'html.parser')
        try:
            hemis_url = hemis_soup.find('a', text='Sample').get('href')
            hemis_title = hemis_soup.find('h2', 'title').text
        except AttributeError:
            # A lookup returned None; skip this page instead of crashing.
            pass
        else:
            hemisphere_images.append({"title": hemis_title, "url": hemis_url})

        # Return to the results page so the next thumbnail can be clicked.
        browser.back()
    return hemisphere_images
def getYoutubeLink(seacrhText):
    """Return an embed URL for the first YouTube search hit, if any.

    Args:
        seacrhText: Free-text search query; it is URL-quoted before being
            appended to the search URL.

    Returns:
        The embed URL string for the first result whose ``href`` contains
        "watch", or ``None`` when there are no results or the first result
        is not a watch link.
    """
    # ``urllib.quote`` only exists on Python 2; Python 3 moved it.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    url = "https://www.youtube.com/results?search_query=" + quote(seacrhText)
    response = HTTP.get(url)
    soup = SOUP(response.text, "lxml")
    videos = soup.findAll(attrs={'class': 'yt-uix-tile-link'})
    if not videos:
        # Indexing videos[0] on an empty result used to raise IndexError.
        return None

    href = videos[0]['href']
    if "watch" not in href:
        return None

    # href[9:] drops the first nine characters, presumably '/watch?v='.
    # NOTE(review): the value ends with a literal backslash, kept as-is
    # because callers may rely on it; confirm whether it is intended.
    return "https://www.youtube.com/embed/" + href[9:] + "?rel=0&showinfo=0\\"
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Args:
        emotion: One of "Sad", "Disgust", "Anger", "Anticipation", "Fear",
            "Enjoyment", "Trust" or "Surprise" (case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    genres = {
        "Sad": 'drama',
        "Disgust": 'musical',
        "Anger": 'family',
        "Anticipation": 'thriller',
        "Fear": 'sport',
        "Enjoyment": 'thriller',
        "Trust": 'western',
        "Surprise": 'film_noir',
    }
    genre = genres.get(emotion)
    if genre is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Previously ``urlhere`` was unconditionally overwritten with a fixed
    list URL right before the request, so the emotion never influenced the
    result. The per-emotion genre URL is now used, and the fixed list is
    kept only as the fallback for unrecognised emotions. A leftover debug
    ``print`` of every anchor on the page was removed.

    Args:
        emotion: One of "Sad", "Disgust", "Anger", "Anticipation", "Fear",
            "Enjoyment", "Trust" or "Surprise" (case-sensitive); any other
            value selects the fallback list.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern.
    """
    genres = {
        "Sad": 'drama',
        "Disgust": 'musical',
        "Anger": 'family',
        "Anticipation": 'thriller',
        "Fear": 'sport',
        "Enjoyment": 'thriller',
        "Trust": 'western',
        "Surprise": 'film_noir',
    }
    genre = genres.get(emotion)
    if genre is None:
        urlhere = "https://www.imdb.com/list/ls009668314/"
    else:
        urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre

    # Download the page and collect the title links from the parsed HTML.
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def get_movie(emotion):
    """Scrape name and genre of the movies IMDb lists for an emotion.

    Args:
        emotion: One of "sad", "anger", "anticipation", "fear",
            "enjoyment", "enthusiastic", "surprise" or "happy"
            (lower-case, case-sensitive).

    Returns:
        A dict keyed by 0-based position whose values are
        ``{'Name': ..., 'Genre': ...}`` dicts, one per
        ``div.lister-item-content`` element on the page. Empty when
        ``emotion`` is not recognised.
    """
    genre_search = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc'
    urls = {
        "sad": genre_search % 'drama',
        "anger": genre_search % 'family',
        "anticipation": genre_search % 'thriller',
        "fear": genre_search % 'sport',
        "enjoyment": genre_search % 'thriller',
        "enthusiastic": 'https://www.imdb.com/search/title/?genres=comedy&title_type=feature&sort=moviemeter, asc',
        "surprise": genre_search % 'film_noir',
        "happy": 'https://www.imdb.com/list/ls060352216/,asc',
    }
    details = {}
    urlhere = urls.get(emotion)
    if urlhere is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return details

    response = requests.get(urlhere)
    soup = SOUP(response.text, "html.parser")
    entries = soup.find_all("div", attrs={"class": "lister-item-content"})
    for index, entry in enumerate(entries):
        details[index] = {
            # First child of the entry's first link.
            'Name': entry.a.contents[0],
            'Genre': entry.find("span", {"class": "genre"}).contents[0],
        }
    return details
def main(emotion):
    """Fetch the IMDb list-item headers for the genre mapped to an emotion.

    Matching is case-insensitive; this also covers the capitalised
    spellings the former if-chain listed, and the "neutral" branch whose
    two comparisons were identical.

    Args:
        emotion: One of "sad", "disgust", "angry", "neutral", "fear",
            "happy" or "surprise", in any letter case.

    Returns:
        The ``find_all`` result holding every
        ``<h3 class="lister-item-header">`` element, or an empty list when
        ``emotion`` is not recognised.
    """
    genres = {
        "sad": 'drama',
        "disgust": 'musical',
        "angry": 'family',
        "neutral": 'thriller',
        "fear": 'sport',
        "happy": 'thriller',
        "surprise": 'film_noir',
    }
    key = emotion.lower() if isinstance(emotion, str) else emotion
    genre = genres.get(key)
    if genre is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all('h3', {'class': 'lister-item-header'})
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    The received emotion is echoed to stdout before the lookup.

    Args:
        emotion: One of "sadness", "disgust", "anger", "neutral", "happy"
            or "surprise" (lower-case, case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    print(emotion)
    genres = {
        "sadness": 'drama',
        "disgust": 'musical',
        "anger": 'family',
        "neutral": 'thriller',
        "happy": 'thriller',
        "surprise": 'film_noir',
    }
    genre = genres.get(emotion)
    if genre is None:
        # Previously the request was issued with ``None`` as the URL,
        # which fails inside the HTTP client.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def scrape(buttonTitle):
    """Download the page registered for a button and return its title links.

    Args:
        buttonTitle: Key into the module-level ``app_dict`` mapping, whose
            value is the URL to fetch. A missing key raises ``KeyError``.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern; the entries are still tags, not
        plain movie names.
    """
    page = HTTP.get(app_dict[buttonTitle])
    parsed = SOUP(page.text, "lxml")
    title_href = re.compile(r'\/title\/tt+\d*\/')
    return parsed.find_all("a", attrs={"href": title_href})
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Emotion-to-genre mapping::

        Sad          -> drama
        Disgust      -> musical
        Anger        -> family
        Anticipation -> thriller
        Fear         -> sport
        Joy          -> thriller
        Trust        -> western
        Surprise     -> film_noir

    :param emotion: Name of the emotion (case-sensitive).
    :return: The ``find_all`` result holding every ``<a>`` tag whose
        ``href`` matches the title-link pattern, or an empty list when
        ``emotion`` is not in the table above.
    """
    genres = {
        "Sad": "drama",
        "Disgust": "musical",
        "Anger": "family",
        "Anticipation": "thriller",
        "Fear": "sport",
        "Joy": "thriller",
        "Trust": "western",
        "Surprise": "film_noir",
    }
    genre = genres.get(emotion)
    if genre is None:
        # An unrecognised emotion used to leave ``linked_url`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    linked_url = "http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc" % genre
    response = HTTP.get(linked_url)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r"\/title\/tt+\d*\/")})
def main(emotion):
    """Fetch the IMDb title links associated with an emotion.

    Args:
        emotion: One of "sad", "anger", "anticipation", "fear",
            "enjoyment", "enthusiastic", "surprise" or "happy"
            (lower-case, case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    genre_search = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc'
    urls = {
        "sad": genre_search % 'drama',
        "anger": genre_search % 'family',
        "anticipation": genre_search % 'thriller',
        "fear": genre_search % 'sport',
        "enjoyment": genre_search % 'thriller',
        "enthusiastic": 'https://www.imdb.com/search/title/?genres=comedy&title_type=feature&sort=moviemeter, asc',
        "surprise": genre_search % 'film_noir',
        "happy": 'https://www.imdb.com/list/ls060352216/,asc',
    }
    urlhere = urls.get(emotion)
    if urlhere is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    # Download the page and collect the title links from the parsed HTML.
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Args:
        emotion: Either 'sad' (drama) or 'surprise' (film noir);
            case-sensitive.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is neither of the two supported values.
    """
    genres = {'sad': 'drama', 'surprise': 'film_noir'}
    genre = genres.get(emotion)
    if genre is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Args:
        emotion: One of "Sad", "Disgust", "Anger", "Anticipation", "Fear",
            "Enjoyment", "Trust" or "Surprise" (case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    # Emotion -> IMDb genre slug.
    genres = {
        "Sad": 'drama',
        "Disgust": 'musical',
        "Anger": 'family',
        "Anticipation": 'thriller',
        "Fear": 'sport',
        "Enjoyment": 'thriller',
        "Trust": 'western',
        "Surprise": 'film_noir',
    }
    if emotion not in genres:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genres[emotion]
    # Download the page and collect the title links from the parsed HTML.
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def main(genre):
    """Return the IMDb title links found on the search page for a genre.

    Args:
        genre: Genre string; it is appended verbatim to the search URL.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern.
    """
    page = HTTP.get("https://www.imdb.com/search/title/?genres=" + genre)
    parsed = SOUP(page.text, "lxml")
    title_href = re.compile(r'\/title\/tt+\d*\/')
    return parsed.find_all("a", attrs={"href": title_href})
def f5():
    """Print sport-genre movie titles scraped from IMDb.

    Prints a banner, then one title per line taken from the first 13
    matching links on the page, then two blank lines. Returns ``None``.
    """
    print("FEAR MOVIES HAIN")
    urlhere = 'http://www.imdb.com/search/title?genres=sport&title_type=feature&sort=moviemeter, asc'
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    title = soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})

    # The slice replaces the former manual counter, which stopped after
    # the 13th anchor (break once count > 11).
    for anchor in title[:13]:
        parts = str(anchor).split('>')
        # Three pieces means the tag has a single '>' before its closing
        # tag, i.e. plain text content; the slice drops the trailing '</a'.
        if len(parts) == 3:
            print(parts[1][:-3])
    print()
    print()
def main(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion code.

    Args:
        emotion: Integer code from 1 to 6 (1 family, 2 drama, 3 sport,
            4 musical, 5 "happy", 6 comedy).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not one of the six codes.
    """
    legacy_search = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc'
    current_search = 'https://www.imdb.com/search/title/?genres=%s&title_type=feature&sort=moviemeter, asc'
    urls = {
        1: legacy_search % 'family',
        2: legacy_search % 'drama',
        3: legacy_search % 'sport',
        4: legacy_search % 'musical',
        # NOTE(review): "happy" does not look like an IMDb genre slug; confirm.
        5: current_search % 'happy',
        6: current_search % 'comedy',
    }
    urlhere = urls.get(emotion)
    if urlhere is None:
        # An unrecognised code used to leave ``urlhere`` unbound and crash
        # with UnboundLocalError on the request line.
        return []

    # Download the page and collect the title links from the parsed HTML.
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def get_movie(emotion):
    """Scrape IMDb title links for the genre mapped to the user's mood.

    Args:
        emotion: "neutral" (drama), "negative" (western) or "positive"
            (sport); case-sensitive.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is none of the three moods.
    """
    genres = {
        "neutral": 'drama',
        "negative": 'western',
        "positive": 'sport',
    }
    genre = genres.get(emotion)
    if genre is None:
        # An unrecognised mood used to leave ``urlhere`` unbound and crash
        # with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def main(emotions):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Args:
        emotions: One of "Sad", "Disgust", "Anger" or "Anticipation"
            (case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotions``
        is not recognised.
    """
    genres = {
        "Sad": 'drama',
        "Disgust": 'musical',
        "Anger": 'family',
        "Anticipation": 'thriller',
    }
    genre = genres.get(emotions)
    if genre is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def scrapAndProcess(emotion):
    """Return the movie names IMDb lists for the genre tied to an emotion.

    Args:
        emotion: One of "sad", "disgust", "anger", "anticipation", "fear",
            "enjoyment", "trust" or "surprise" (lower-case).

    Returns:
        A list of movie-name strings. The list is empty for an
        unrecognised emotion; if fetching or parsing fails, the error is
        printed and whatever was collected so far is returned.
    """
    known_urls = (
        ("sad", 'http://www.imdb.com/search/title?genres=drama&title_type=feature&sort=moviemeter, asc'),
        ("disgust", 'http://www.imdb.com/search/title?genres=musical&title_type=feature&sort=moviemeter, asc'),
        ("anger", 'http://www.imdb.com/search/title?genres=family&title_type=feature&sort=moviemeter, asc'),
        ("anticipation", 'http://www.imdb.com/search/title?genres=thriller&title_type=feature&sort=moviemeter, asc'),
        ("fear", 'http://www.imdb.com/search/title?genres=sport&title_type=feature&sort=moviemeter, asc'),
        ("enjoyment", 'http://www.imdb.com/search/title?genres=thriller&title_type=feature&sort=moviemeter, asc'),
        ("trust", 'http://www.imdb.com/search/title?genres=western&title_type=feature&sort=moviemeter, asc'),
        ("surprise", 'http://www.imdb.com/search/title?genres=film_noir&title_type=feature&sort=moviemeter, asc'),
    )
    # First matching name wins; no match leaves the URL empty.
    url = ""
    for name, candidate in known_urls:
        if emotion == name:
            url = candidate
            break

    movies = []
    if not url:
        return movies

    # Link texts that are not movie names and must be skipped.
    flags = ["None", "X", "\n"]
    try:
        page = HTTP.get(url)
        soup = SOUP(page.text, "lxml")
        title_href = re.compile(r'\/title\/tt+\d*\/')
        for anchor in soup.findAll('a', attrs={"href": title_href}):
            label = str(anchor.string)
            if label in flags:
                continue
            movies.append(label)
    except Exception as e:
        # Best effort: report the failure and return what was gathered.
        print(e)
    return movies
def target(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    The former test ``emotion == "anticipation" or "enjoyment"`` was always
    true, so "surprise" could never be selected and every unrecognised
    emotion fell through to the thriller page. Each emotion now maps to its
    own genre and unrecognised emotions yield no results.

    Args:
        emotion: One of "disgust", "sad", "trust", "anger", "fear",
            "anticipation", "enjoyment" or "surprise" (lower-case).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    genres = {
        "disgust": 'musical',
        "sad": 'drama',
        "trust": 'western',
        "anger": 'family',
        "fear": 'sport',
        "anticipation": 'thriller',
        "enjoyment": 'thriller',
        "surprise": 'film_noir',
    }
    genre = genres.get(emotion)
    if genre is None:
        return []

    url = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(url)
    field = SOUP(response.text, "lxml")
    # Regex extraction of the title links.
    return field.find_all("a", attrs={"href": regex.compile(r'\/title\/tt+\d*\/')})
def main(emotion):
    """Fetch the IMDb title links from the page curated for an emotion.

    Matching is case-insensitive, which covers the capitalised and
    lower-case spellings the former if-chain accepted.

    Args:
        emotion: One of "sad", "disgust", "anger", "anticipation", "fear",
            "enjoyment", "trust" or "surprise", in any letter case.

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    urls = {
        "sad": 'https://www.imdb.com/list/ls009576722/',
        "disgust": 'https://www.imdb.com/list/ls075745491/',
        "anger": 'https://www.imdb.com/list/ls000445157/',
        "anticipation": 'https://www.imdb.com/india/upcoming/',
        "fear": 'https://www.imdb.com/list/ls058201636/',
        "enjoyment": 'https://www.imdb.com/list/ls005597767/',
        "trust": 'https://www.imdb.com/list/ls051594496/',
        "surprise": 'https://www.imdb.com/list/ls008944391/',
    }
    key = emotion.lower() if isinstance(emotion, str) else emotion
    urlhere = urls.get(key)
    if urlhere is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "html.parser")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def recommend(emotion):
    """Fetch the IMDb title links for the genre mapped to an emotion.

    Args:
        emotion: One of "Sad", "Disgust", "Anger", "Anticipation", "Fear",
            "Enjoyment", "Trust" or "Surprise" (case-sensitive).

    Returns:
        The ``find_all`` result holding every ``<a>`` tag whose ``href``
        matches the title-link pattern, or an empty list when ``emotion``
        is not recognised.
    """
    genres = {
        "Sad": 'drama',
        "Disgust": 'musical',
        "Anger": 'family',
        "Anticipation": 'thriller',
        "Fear": 'sport',
        "Enjoyment": 'thriller',
        "Trust": 'western',
        "Surprise": 'film_noir',
    }
    genre = genres.get(emotion)
    if genre is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    # Download the HTML, parse it into a tree and pick out the title links.
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    return soup.find_all("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
def main(emotion):
    """Print and return the IMDb movies suggested for a detected emotion.

    For a recognised emotion this prints the detected emotion, a banner,
    and then a title line plus a rating line for up to ten movies. Pages
    with fewer than ten entries used to raise IndexError; they are now
    printed as far as they go. Titles and ratings are paired by position.

    Args:
        emotion: One of "sad", "disgust", "angry", "neutral", "scared",
            "happy" or "surprised", in any letter case.

    Returns:
        The ``find_all`` result holding every
        ``<h3 class="lister-item-header">`` element, or an empty list when
        ``emotion`` is not recognised.
    """
    em = emotion.lower()
    genres = {
        "sad": 'drama',
        "disgust": 'musical',
        "angry": 'family',
        "neutral": 'thriller',
        "scared": 'sport',
        "happy": 'thriller',
        "surprised": 'film_noir',
    }
    genre = genres.get(em)
    if genre is None:
        # An unrecognised emotion used to leave ``urlhere`` unbound and
        # crash with UnboundLocalError on the request line.
        return []
    print("EMOTION DETECTED:", em)

    urlhere = 'http://www.imdb.com/search/title?genres=%s&title_type=feature&sort=moviemeter, asc' % genre
    response = HTTP.get(urlhere)
    soup = SOUP(response.text, "lxml")
    title1 = soup.find_all("h3", {"class": "lister-item-header"})
    rating = soup.find_all("div", {"class": "inline-block ratings-imdb-rating"})

    print("LIST OF APT MOVIES BASED ON USERS CURRENT EMOTION:")
    # zip stops at the shorter sequence, so a short page cannot overrun.
    for header, score in zip(title1[:10], rating[:10]):
        print(header.text)
        print("rating=", score.text)
    return title1
def scrapAndProcess(emotion):
    """Collect the anchor texts of IMDb title links for an emotion's genre.

    ``emotion`` is compared (case-sensitively) against a fixed set of labels,
    each tied to one IMDb genre search URL. The page behind that URL is
    fetched, and the text of every ``<a>`` whose ``href`` matches the title
    pattern is collected, skipping the known noise values.

    Args:
        emotion: One of "sad", "disgust", "anger", "anticipation", "fear",
            "enjoyment", "trust" or "surprise".

    Returns:
        A list of strings. It is empty when the emotion is not one of the
        labels above; any exception raised while fetching or parsing is
        printed and whatever was collected so far is returned.
    """
    # (emotion label, IMDb search URL) pairs, scanned in order with ``==``.
    search_urls = (
        ("sad",
         'http://www.imdb.com/search/title?genres=drama&title_type=feature&sort=moviemeter, asc'),
        ("disgust",
         'http://www.imdb.com/search/title?genres=musical&title_type=feature&sort=moviemeter, asc'),
        ("anger",
         'http://www.imdb.com/search/title?genres=family&title_type=feature&sort=moviemeter, asc'),
        ("anticipation",
         'http://www.imdb.com/search/title?genres=thriller&title_type=feature&sort=moviemeter, asc'),
        ("fear",
         'http://www.imdb.com/search/title?genres=sport&title_type=feature&sort=moviemeter, asc'),
        ("enjoyment",
         'http://www.imdb.com/search/title?genres=thriller&title_type=feature&sort=moviemeter, asc'),
        ("trust",
         'http://www.imdb.com/search/title?genres=western&title_type=feature&sort=moviemeter, asc'),
        ("surprise",
         'http://www.imdb.com/search/title?genres=film_noir&title_type=feature&sort=moviemeter, asc'),
    )

    url = ""
    for label, candidate_url in search_urls:
        if emotion == label:
            url = candidate_url
            break

    movies = []

    # Unknown emotion: nothing to request.
    if not url:
        return movies

    # Everything that touches the network or the parser stays inside the
    # guard so that a failure there does not terminate the caller.
    try:
        page = HTTP.get(url)
        soup = SOUP(page.text, "lxml")

        # Values that show up as anchor text but are not movie names.
        noise = ["None", "X", "\n"]

        anchors = soup.findAll(
            'a', attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
        for anchor in anchors:
            # NavigableString (or None) -> plain str
            name = str(anchor.string)
            if name in noise:
                continue
            movies.append(name)
    except Exception as e:
        print(e)

    return movies
def scrape_rt(RT, num):
    """Scrape a Rotten Tomatoes listing page and return its top ``num`` movies.

    Steps:
      1. Read every table row of the listing at ``RT``: movie title, profile
         link and number of reviews.
      2. Convert each Tomatometer percentage to a 0-10 scale and scale it by
         ``log(log(reviews, 4), 5)``.
      3. Rank with ``rank_movies`` and keep the first ``num`` entries, so that
         only those movies' profile pages need to be downloaded.
      4. Visit each kept profile page and insert the synopsis, the "Rating:"
         value and the "Runtime:" value in front of the score.

    Args:
        RT: URL of the Rotten Tomatoes listing page.
        num: Number of ranked movies to keep.

    Returns:
        A dict mapping movie title to a list of details; the synopsis, rating
        label and runtime are inserted at indexes 0, 1 and 2 when the profile
        page provides them, followed by the adjusted score.
    """
    listing = SOUP(requests.get(RT).text, 'lxml')

    movies_by_title = {}
    titles = []
    review_counts = []

    # Pass 1: one table row per movie -> title, profile link, review count.
    for row in listing.findAll('tr'):
        anchor = row.find("a", class_="unstyled articleLink")
        if anchor is not None:
            anchor_html = str(anchor)
            name = anchor_html.split('">')[1].split(" (")[0].strip(
                '\n').strip()
            movies_by_title[name] = []
            titles.append(name)

            # Link to the movie profile, taken from the anchor's href.
            href = anchor_html.split('href="')[1].split('">\n')[0]
            movies_by_title[name].append(
                "https://www.rottentomatoes.com/" + href)

        review_cell = row.find('td', class_="right hidden-xs")
        if review_cell is not None:
            # Kept for the score adjustment in pass 2.
            review_counts.append(
                int(str(review_cell).split('">')[1].split('</')[0]))

    # Pass 2: scores, matched to titles and review counts by position.
    score_icons = listing.findAll('span', class_='tMeterIcon tiny')
    for review_count, name, icon in zip(review_counts, titles, score_icons):
        score_html = str(icon.find('span', class_="tMeterScore"))
        percent = score_html.split('">\xa0')[1].split('%</')[0]

        # Percentage -> out-of-10 scale, then weight by the review count.
        adjusted = (int(percent) / 10) * log(log(review_count, 4), 5)
        movies_by_title[name].append(round(adjusted, 1))

    # Rank first, then look up profiles for the kept movies only.
    ranked = rank_movies(movies_by_title)
    ranked = dict(list(ranked.items())[0:num])

    # The profile link is the first element of each entry; take it out so the
    # entry holds only movie details from here on.
    profile_links = [details.pop(0) for details in ranked.values()]

    # Pass 3: synopsis, rating label and runtime live on the profile pages.
    for name, link in zip(list(ranked.keys()), profile_links):
        profile = SOUP(requests.get(link).text, 'lxml')
        details = ranked[name]

        for synopsis_tag in profile.findAll(
                'div', {'class': 'movie_synopsis clamp clamp-6 js-clamp'}):
            details.insert(0, str(synopsis_tag.text).replace("\n", ""))

        for meta_row in profile.findAll('li', {'class': 'meta-row clearfix'}):
            label = meta_row.find('div', {'class': 'meta-label subtle'}).text
            if label == "Rating:":
                value = meta_row.find('div', {'class': 'meta-value'}).text
                details.insert(1, value.replace("\n", "").replace(" ", ""))
            elif label == "Runtime:":
                value = meta_row.find('div', {'class': 'meta-value'}).text
                details.insert(2, value.replace("\n", "").replace(" ", ""))

    return ranked
def scrape_IMDB(IMDB, num, folder_path=None):
    """Scrape an IMDb listing page and return its top ``num`` ranked movies.

    For every movie on the page the title, summary, certificate ("grading")
    and runtime are collected; the IMDb rating is then scaled by
    ``log(log(votes, 5), 10)`` and appended, and the result is ranked with
    ``rank_movies``.

    Args:
        IMDB: URL of the IMDb listing page.
        num: Number of ranked movies to keep.
        folder_path: Accepted for backward compatibility; this function does
            not read it. (It used to be overwritten with a hard-coded value
            and then ignored.)

    Returns:
        A dict mapping movie title to
        ``[summary, grading, runtime, weighted_rating]``; summary, grading
        and runtime are the string ``"Not Found"`` when the page lacks them.
    """
    response = requests.get(IMDB)
    data = SOUP(response.text, 'lxml')

    IMDB_dict = {}
    title_lst = []
    num_reviews = []

    # Compiled once instead of once per movie.
    title_href = re.compile(r'\/title\/tt+\d*\/')
    summary_marker = ', <p class="text-muted">'

    for movie in data.findAll('div', class_="lister-item-content"):
        # title
        title = movie.find("a", attrs={"href": title_href})
        title = str(title).split('">')[1].split('</')[0]
        IMDB_dict[title] = []
        title_lst.append(title)

        # movie summary: the text after the marker in the stringified list
        # of "text-muted" paragraphs. findAll never returns None, so the
        # presence of the marker is what has to be checked; fall back to
        # "Not Found" like the grading and runtime fields below.
        paragraphs_html = str(movie.findAll('p', {'class': 'text-muted'}))
        summary_parts = paragraphs_html.split(summary_marker)
        if len(summary_parts) > 1:
            summary = summary_parts[1].replace("\n", "").replace("</p>]", "")
        else:
            summary = "Not Found"
        IMDB_dict[title].append(summary)

        # grading
        grading = movie.find('span', class_="certificate")
        if grading is not None:
            grading = str(grading).split('">')[1].split('</')[0]
        else:
            grading = "Not Found"
        IMDB_dict[title].append(grading)

        # runtime
        length = movie.find('span', class_="runtime")
        if length is not None:
            length = str(length).split('">')[1].split('</')[0]
        else:
            length = "Not Found"
        IMDB_dict[title].append(length)

    # No. of reviewers; zip() caps the count at the number of titles.
    # NOTE(review): every non-digit of the paragraph text is stripped, so any
    # other number in the same paragraph would be concatenated - confirm the
    # paragraph only ever contains the vote count.
    vote_tags = data.findAll('p', class_="sort-num_votes-visible")
    for _, votes in zip(title_lst, vote_tags):
        num_reviews.append(int(re.sub("[^0-9]", "", votes.text)))

    # rating, matched to titles and vote counts by position
    rating_bars = data.findAll('div', class_="ratings-bar")
    for review, title, movie in zip(num_reviews, title_lst, rating_bars):
        rating = movie.find('div', class_="inline-block ratings-imdb-rating")
        # Fourth space-separated piece of the tag's markup; presumably the
        # attribute carrying the numeric rating - verify against the page.
        rating_field = str(rating).split(' ')[3]
        try:
            rating = float(re.search(r'[\d]*[.][\d]+', rating_field).group())
        except AttributeError:
            # No decimal number matched (re.search returned None): the
            # rating is a whole number.
            rating = float(re.search(r'\d+', rating_field).group())

        # score adjustment based on the number of reviewers
        weightedRating = rating * log(log(review, 5), 10)
        weightedRating = round(weightedRating, 1)
        IMDB_dict[title].append(weightedRating)

    ranked_dict = rank_movies(IMDB_dict)
    ranked_dict = dict(list(ranked_dict.items())[0:num])
    return ranked_dict