Пример #1
0
 def get_data(self, search_query):
     """Scrape Google Images for large images matching *search_query*.

     Args:
         search_query: free-text query passed as the ``q`` parameter.

     Returns:
         list of image URLs extracted from the results page's
         ``id="images"`` div; empty on network failure or no matches.
     """
     params = {
         "site": "imghp",
         "tbm": "isch",   # image-search vertical
         "tbs": "isz:l",  # restrict to large images
         "q": search_query
     }
     headers = {
         'User-agent':
         'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
         IEMobile/7.0; LG; GW910)'
     }
     html = ''
     try:
         html = requests.get('https://www.google.com/search',
                             headers=headers,
                             params=params,
                             timeout=5).text
     except Exception as exc:
         # Best-effort: log and fall through with empty html -> empty result.
         log_exception(__name__, exc)
     soup = BeautifulSoup.BeautifulSoup(html, features="html.parser")
     results = []
     for div in soup.findAll('div'):
         # BUGFIX: only the div with id "images" holds the result links.
         # The original test was inverted ("if not ... == 'images'") and
         # collected hrefs from every OTHER div, appending raw non-image
         # URLs (split on an absent "imgurl=" returns the whole href).
         if div.get("id") == "images":
             for a_link in div.findAll("a"):
                 page = a_link.get("href")
                 try:
                     img = page.split("imgurl=")[-1]
                     img = img.split("&imgrefurl=")[0]
                     results.append(img)
                 except Exception:
                     # href missing or malformed -- skip this link
                     pass
     return results
Пример #2
0
def fetch_hubble_image(image_id):
    """Download the best-quality file for one Hubble image.

    Queries the hubblesite.org v3 API for *image_id*, takes the last
    (highest-resolution) entry of ``image_files`` and saves it as
    ``<image_id><ext>`` via load_image().

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    hubble_api = f"http://hubblesite.org/api/v3/image/{image_id}"
    response = requests.get(hubble_api, timeout=30)
    # Fail loudly on HTTP errors instead of crashing later on bad JSON
    # (consistent with the sibling SpaceX downloader).
    response.raise_for_status()
    best_image = response.json()['image_files'][-1]
    # Read file_url once instead of the original's mixed [] / .get() access.
    image_url = best_image['file_url']
    image_ext = get_ext(image_url)
    load_image(image_url, f"{image_id}{image_ext}")
Пример #3
0
def main():
    """Download every Flickr photo attached to the latest SpaceX launch."""
    api_url = 'https://api.spacexdata.com/v3/launches/latest'
    launch = requests.get(api_url)
    launch.raise_for_status()
    photo_urls = launch.json()['links']['flickr_images']
    # Files are numbered from 1: spacex1.jpg, spacex2.jpg, ...
    for index, url in enumerate(photo_urls, start=1):
        download_image(url, f'spacex{index}.jpg')
def main():
    """Fetch every image id in a Hubble collection named on the CLI.

    Parses a single positional ``collection`` argument, queries the
    hubblesite.org v3 images endpoint for that collection, and downloads
    each returned image by id.
    """
    parser = argparse.ArgumentParser(
        description='Downloads photo by collection name')
    parser.add_argument('collection', help='Enter collection name')
    args = parser.parse_args()
    payload = {'collection_name': args.collection}
    url = 'http://hubblesite.org/api/v3/images'
    response = requests.get(url, params=payload)
    # Surface HTTP errors before trying to parse the body as JSON
    # (consistent with the sibling SpaceX main()).
    response.raise_for_status()
    image_ids = response.json()
    for image in image_ids:
        # NOTE(review): called as fetch_hubble_images (plural) here while a
        # sibling defines fetch_hubble_image (singular) -- confirm the name.
        fetch_hubble_images(image['id'])
Пример #5
0
 def get_top250_db(self):
     '''
         Build a dict mapping imdb id -> top250 rank, covering both the
         movie and the tv-show charts (one HTTP request per chart).
     '''
     results = {}
     charts = {"top": "chttp_tt_", "toptv": "chttvtp_tt_"}
     for chart_name, rank_prefix in charts.items():
         response = requests.get("http://www.imdb.com/chart/%s" % chart_name,
                                 headers={'User-agent': 'Mozilla/5.0'},
                                 timeout=20)
         soup = BeautifulSoup.BeautifulSoup(response.text)
         for table in soup.findAll('table'):
             if table.get("class") != "chart full-width":
                 continue
             for cell in table.findAll('td'):
                 if cell.get("class") != "titleColumn":
                     continue
                 link = cell.find("a")
                 if not link:
                     continue
                 href = link["href"]
                 # href looks like /title/<imdbid>/...<prefix><rank>
                 imdb_id = href.split("/")[2]
                 rank = href.split(rank_prefix)[1]
                 results[imdb_id] = try_parse_int(rank)
     return results
Пример #6
0
 def get_data(self, search_query):
     '''scrape the google images results page and return the image urls'''
     params = {"site": "imghp", "tbm": "isch", "tbs": "isz:l", "q": search_query}
     headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
         IEMobile/7.0; LG; GW910)'}
     html = ''
     try:
         html = requests.get('https://www.google.com/search', headers=headers, params=params, timeout=5).text
     except Exception as exc:
         # network failure: log and continue with empty html
         log_exception(__name__, exc)
     soup = BeautifulSoup.BeautifulSoup(html)
     results = []
     for div in soup.findAll('div'):
         # only the div with id "images" carries the result links
         if div.get("id") != "images":
             continue
         for a_link in div.findAll("a"):
             href = a_link.get("href")
             try:
                 image_url = href.split("imgurl=")[-1].split("&imgrefurl=")[0]
                 results.append(image_url)
             except Exception:
                 # href missing or malformed -- skip this link
                 pass
     return results
Пример #7
0
 def get_top250_db(self):
     '''
         Return {imdb_id: rank} for the imdb top250 movie and tv charts.
     '''
     results = {}
     for chart, prefix in (("top", "chttp_tt_"), ("toptv", "chttvtp_tt_")):
         page = requests.get(
             "http://www.imdb.com/chart/%s" % chart,
             headers={'User-agent': 'Mozilla/5.0'},
             timeout=20)
         soup = BeautifulSoup.BeautifulSoup(page.text)
         # narrow to the chart table(s), then to the title cells
         tables = [tbl for tbl in soup.findAll('table')
                   if tbl.get("class") == "chart full-width"]
         for table in tables:
             cells = [td for td in table.findAll('td')
                      if td.get("class") == "titleColumn"]
             for cell in cells:
                 link = cell.find("a")
                 if link:
                     href = link["href"]
                     imdb_id = href.split("/")[2]
                     rank = href.split(prefix)[1]
                     results[imdb_id] = try_parse_int(rank)
     return results
Пример #8
0
def get_ids_from_collection(collection):
    """Return the ids of all images in a hubblesite.org collection.

    Args:
        collection: collection name as used by the v3 API path.

    Returns:
        list of image ids (None for any entry missing an 'id' key).

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    api_url = f"http://hubblesite.org/api/v3/images/{collection}"
    response = requests.get(api_url, timeout=30)
    # Fail fast on HTTP errors rather than mis-parsing an error body
    # (consistent with the sibling downloaders that check status).
    response.raise_for_status()
    image_ids = [image.get('id') for image in response.json()]
    return image_ids