def get_data(self, search_query):
    """
    Scrape Google Images for *search_query* and return a list of image URLs.

    Sends a large-image ("isz:l") image-search request, then parses the
    result page for links inside the div with id "images" and extracts the
    target image URL from each link's ``imgurl=...&imgrefurl=`` parameters.

    :param search_query: search terms to look up on Google Images
    :return: list of image URL strings (empty on request failure / no hits)
    """
    params = {
        "site": "imghp",
        "tbm": "isch",   # image search vertical
        "tbs": "isz:l",  # restrict to large images
        "q": search_query}
    headers = {
        'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
IEMobile/7.0; LG; GW910)'}
    html = ''
    try:
        html = requests.get('https://www.google.com/search', headers=headers,
                            params=params, timeout=5).text
    except Exception as exc:
        # best-effort: log and fall through with empty html -> empty result
        log_exception(__name__, exc)
    soup = BeautifulSoup.BeautifulSoup(html, features="html.parser")
    results = []
    for div in soup.findAll('div'):
        # BUG FIX: was ``if not div.get("id") == "images":`` which scraped
        # links from every div EXCEPT the image-results container; the
        # sibling implementation in this file tests without the negation.
        if div.get("id") == "images":
            for a_link in div.findAll("a"):
                page = a_link.get("href")
                try:
                    img = page.split("imgurl=")[-1]
                    img = img.split("&imgrefurl=")[0]
                    results.append(img)
                except Exception:
                    # href missing or not in the expected format; skip entry
                    pass
    return results
def fetch_hubble_image(image_id):
    """
    Download the best available file for a Hubble image id.

    Queries the hubblesite v3 image endpoint, takes the last entry of
    ``image_files`` (assumed to be the highest resolution -- TODO confirm
    the API's ordering), and saves it as ``<image_id><ext>`` via
    ``load_image``.

    :param image_id: hubblesite image identifier
    :raises requests.HTTPError: if the API responds with an error status
    """
    hubble_api = f"http://hubblesite.org/api/v3/image/{image_id}"
    response = requests.get(hubble_api)
    # fail loudly on HTTP errors, consistent with the other fetchers here
    response.raise_for_status()
    best_image = response.json()['image_files'][-1]
    # read file_url once; the original indexed it and then .get()-ed it,
    # which could never diverge (the index raised first if it was missing)
    image_url = best_image['file_url']
    image_ext = get_ext(image_url)
    load_image(image_url, f"{image_id}{image_ext}")
def main():
    """Download every Flickr photo attached to the latest SpaceX launch."""
    latest_launch_url = 'https://api.spacexdata.com/v3/launches/latest'
    api_response = requests.get(latest_launch_url)
    api_response.raise_for_status()
    flickr_links = api_response.json()['links']['flickr_images']
    photo_number = 1
    for link in flickr_links:
        # files are named spacex1.jpg, spacex2.jpg, ...
        download_image(link, f'spacex{photo_number}.jpg')
        photo_number += 1
def main():
    """
    CLI entry point: download every image in a named Hubble collection.

    Parses the collection name from the command line, asks the hubblesite
    v3 API for the collection's image list, and fetches each image.
    """
    parser = argparse.ArgumentParser(
        description='Downloads photo by collection name')
    parser.add_argument('collection', help='Enter collection name')
    args = parser.parse_args()
    payload = {'collection_name': args.collection}
    url = 'http://hubblesite.org/api/v3/images'
    response = requests.get(url, params=payload)
    # fail loudly on HTTP errors, consistent with the other fetchers here
    response.raise_for_status()
    image_ids = response.json()
    for image in image_ids:
        # BUG FIX: was fetch_hubble_images(...) -- an undefined name; the
        # helper defined in this codebase is fetch_hubble_image (singular),
        # so the original raised NameError on the first iteration.
        fetch_hubble_image(image['id'])
def get_top250_db(self):
    '''
    get the top250 listing for both movies and tvshows as dict with
    imdbid as key; uses 7 day cache to prevent overloading the server
    '''
    ranks = {}
    charts = (("top", "chttp_tt_"), ("toptv", "chttvtp_tt_"))
    for chart_path, rank_marker in charts:
        response = requests.get("http://www.imdb.com/chart/%s" % chart_path,
                                headers={'User-agent': 'Mozilla/5.0'},
                                timeout=20)
        parsed = BeautifulSoup.BeautifulSoup(response.text)
        for table in parsed.findAll('table'):
            # only the main chart table holds the title rows
            if not table.get("class") == "chart full-width":
                continue
            for cell in table.findAll('td'):
                if not cell.get("class") == "titleColumn":
                    continue
                link = cell.find("a")
                if not link:
                    continue
                href = link["href"]
                # href carries both the imdb id and the chart-rank anchor
                ranks[href.split("/")[2]] = try_parse_int(
                    href.split(rank_marker)[1])
    return ranks
def get_data(self, search_query):
    '''
    helper method to get data from google images by scraping and parsing

    Sends a large-image ("isz:l") image-search request, then pulls the
    target image URL out of each result link's ``imgurl=...&imgrefurl=``
    parameters inside the div with id "images".

    :param search_query: search terms to look up on Google Images
    :return: list of image URL strings (empty on request failure / no hits)
    '''
    params = {"site": "imghp",
              "tbm": "isch",   # image search vertical
              "tbs": "isz:l",  # restrict to large images
              "q": search_query}
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
IEMobile/7.0; LG; GW910)'}
    html = ''
    try:
        html = requests.get('https://www.google.com/search', headers=headers,
                            params=params, timeout=5).text
    except Exception as exc:
        # best-effort: log and fall through with empty html -> empty result
        log_exception(__name__, exc)
    # CONSISTENCY FIX: pass features="html.parser" as the sibling copy of
    # this method does, instead of letting BeautifulSoup guess a parser
    # (which warns and can vary with whichever parsers are installed).
    soup = BeautifulSoup.BeautifulSoup(html, features="html.parser")
    results = []
    for div in soup.findAll('div'):
        if div.get("id") == "images":
            for a_link in div.findAll("a"):
                page = a_link.get("href")
                try:
                    img = page.split("imgurl=")[-1]
                    img = img.split("&imgrefurl=")[0]
                    results.append(img)
                except Exception:
                    # href missing or not in the expected format; skip entry
                    pass
    return results
def get_top250_db(self):
    '''
    Scrape IMDb's Top 250 charts for both movies ("top") and TV shows
    ("toptv") and return a dict mapping imdb_id -> chart rank (int via
    try_parse_int).

    NOTE(review): the docstring of the sibling copy claims a 7 day cache,
    but no caching is visible in this block -- presumably applied by a
    caller or decorator elsewhere; confirm.
    '''
    results = {}
    # (chart path, rank-anchor prefix expected inside each title href)
    for listing in [("top", "chttp_tt_"), ("toptv", "chttvtp_tt_")]:
        html = requests.get(
            "http://www.imdb.com/chart/%s" % listing[0],
            headers={
                'User-agent': 'Mozilla/5.0'},
            timeout=20)
        soup = BeautifulSoup.BeautifulSoup(html.text)
        for table in soup.findAll('table'):
            # NOTE(review): in bs4, get("class") returns a LIST, so this
            # string comparison would never match; it looks written for
            # BeautifulSoup 3 semantics (class as a string) -- confirm
            # which library version is actually in use.
            if table.get("class") == "chart full-width":
                for td_def in table.findAll('td'):
                    if td_def.get("class") == "titleColumn":
                        a_link = td_def.find("a")
                        if a_link:
                            # href presumably looks like
                            # /title/tt0111161/?...chttp_tt_1 -- the id is
                            # path segment 2, the rank follows the anchor
                            url = a_link["href"]
                            imdb_id = url.split("/")[2]
                            imdb_rank = url.split(listing[1])[1]
                            results[imdb_id] = try_parse_int(imdb_rank)
    return results
def get_ids_from_collection(collection):
    """
    Return the list of image ids belonging to a Hubble collection.

    :param collection: hubblesite collection name used in the API path
    :return: list of ``id`` values (``None`` for entries lacking one,
             preserved via ``.get`` as in the original)
    :raises requests.HTTPError: if the API responds with an error status
    """
    api_url = f"http://hubblesite.org/api/v3/images/{collection}"
    response = requests.get(api_url)
    # fail loudly on HTTP errors, consistent with the other fetchers here;
    # previously an error page went straight into .json() and crashed there
    response.raise_for_status()
    # renamed loop variable: each entry is a dict describing an image,
    # not an id (the old name ``image_id`` was misleading)
    image_ids = [entry.get('id') for entry in response.json()]
    return image_ids