def get_images_from_source(soup, url):
    """Collect absolute image URLs referenced by ``<img>`` tags in *soup*.

    For every ``<img>`` tag the ``src``, ``srcset`` and ``data-src``
    attributes are inspected, in that order. The query string is dropped
    and the value is kept only when it ends in ``/<name>.<ext>`` with
    ``ext`` one of ``jpg``, ``jpeg``, ``gif`` or ``png``. A ``srcset``
    value that carries descriptors or several candidates does not end in
    such a file name and is therefore ignored.

    Args:
        soup: Parsed document exposing ``find_all('img')``; each returned
            tag must support ``tag[attribute]`` and raise ``KeyError`` for
            a missing attribute.
        url: URL of the page the document came from. Used as the base for
            resolving relative image sources. May be empty or ``None``, in
            which case relative sources are skipped.

    Returns:
        A list of ``http://`` / ``https://`` image URLs in document order
        (duplicates are kept).
    """
    # Imported locally so the function does not depend on a module-level
    # import being present.
    from urllib.parse import urljoin

    attributes = ('src', 'srcset', 'data-src')
    absolute_prefixes = ('http://', 'https://')
    # Compiled once; the pattern is applied to every candidate below.
    image_name = re.compile(r'/([\w_-]+[.](jpg|jpeg|gif|png))$')

    images = []
    for img in soup.find_all('img'):
        for attribute in attributes:
            try:
                candidate = img[attribute]
            except KeyError:
                continue

            candidate = candidate.split('?')[0]
            if not image_name.search(candidate):
                continue

            is_absolute = candidate.lower().startswith(absolute_prefixes)
            if not is_absolute and url:
                # Relative, root-relative and protocol-relative sources are
                # all resolved against the page URL by urljoin.
                candidate = urljoin(url, candidate)
                is_absolute = candidate.lower().startswith(absolute_prefixes)

            # Anything that is still not an http(s) URL cannot be fetched
            # on its own, so it is left out.
            if is_absolute:
                images.append(candidate)
    return images
def main():
    """Command-line entry point: ``generate.py ScraperClassName url``.

    Reads the scraper class name and a page URL from ``sys.argv``,
    downloads the page with ``requests.get`` using the module's
    ``HEADERS``, and then calls ``generate_scraper``,
    ``generate_scraper_test``, ``generate_test_data`` and ``init_scraper``
    in that order.

    Exits with status 1 after printing a usage line when fewer than two
    arguments are supplied.
    """
    if len(sys.argv) < 3:
        print("Usage: generate.py ScraperClassName url")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is missing under ``python -S``.
        sys.exit(1)

    class_name = sys.argv[1]
    url = sys.argv[2]
    host_name = get_host_name(url)

    # The page is fetched before any generator runs, so a failed download
    # happens ahead of the generate_* calls.
    testhtml = requests.get(url, headers=HEADERS).content

    generate_scraper(class_name, host_name)
    generate_scraper_test(class_name, host_name)
    generate_test_data(class_name, testhtml)
    init_scraper(class_name)