示例#1
0
def get_images_from_source(soup, url):
    """Collect absolute URLs of the images referenced by ``<img>`` tags.

    The ``src``, ``srcset`` and ``data-src`` attributes of every ``<img>``
    element are inspected. A candidate is kept only when, after its query
    string is removed, it ends in ``/<name>.(jpg|jpeg|gif|png)``.

    Args:
        soup: Parsed document exposing ``find_all('img')``; each returned tag
            must support ``tag[attr]`` and raise ``KeyError`` for a missing
            attribute.
        url: URL of the page the document came from. It is used as the base
            for resolving relative image references. May be falsy, in which
            case only references that are already absolute are returned.

    Returns:
        list: ``http``/``https`` image URLs without their query string, in
        document order. Duplicates are not removed.
    """
    # Imported locally so this function does not rely on the module's
    # top-level import list.
    from urllib.parse import urljoin, urlparse

    image_pattern = re.compile(r'/([\w_-]+[.](jpg|jpeg|gif|png))$')
    attributes = ('src', 'srcset', 'data-src')

    candidates = []
    for img in soup.find_all('img'):
        for attribute in attributes:
            try:
                value = img[attribute]
            except KeyError:
                continue
            if attribute == 'srcset':
                # A srcset looks like "a.jpg 1x, b.jpg 2x": keep the URL of
                # each candidate and drop its width/density descriptor.
                # Splitting on comma + whitespace leaves URLs that contain
                # bare commas intact.
                for entry in re.split(r',\s+', value):
                    tokens = entry.split()
                    if tokens:
                        candidates.append(tokens[0])
            else:
                candidates.append(value.strip())

    images = []
    for candidate in candidates:
        candidate = candidate.split('?')[0]
        if not image_pattern.search(candidate):
            continue
        # urljoin copes with root-relative ("/a.png"), document-relative
        # ("img/a.png") and protocol-relative ("//cdn/a.png") references and
        # leaves absolute URLs untouched.
        resolved = urljoin(url, candidate) if url else candidate
        # Check the parsed scheme instead of looking for the substring
        # "http", which also occurs in ordinary path names.
        if urlparse(resolved).scheme in ('http', 'https'):
            images.append(resolved)
    return images
示例#2
0
def main():
    """Command-line entry point: scaffold a scraper from a class name and URL.

    Expects ``sys.argv`` to carry the scraper class name and a page URL. The
    page is downloaded first; its raw bytes, the class name and the host name
    are then passed to the ``generate_*`` and ``init_scraper`` helpers, which
    are defined elsewhere in the project.

    Raises:
        SystemExit: With status 1 when fewer than two arguments are given.
        requests.HTTPError: When the server answers with a 4xx/5xx status.
    """
    if len(sys.argv) < 3:
        print("Usage: generate.py ScraperClassName url")
        # Use sys.exit: the bare ``exit`` helper is installed by the ``site``
        # module for interactive sessions and is missing under ``python -S``.
        sys.exit(1)

    class_name, url = sys.argv[1], sys.argv[2]
    host_name = get_host_name(url)

    response = requests.get(url, headers=HEADERS)
    # Fail here, before any file is generated, rather than storing an HTTP
    # error page as test data.
    response.raise_for_status()
    testhtml = response.content

    generate_scraper(class_name, host_name)
    generate_scraper_test(class_name, host_name)
    generate_test_data(class_name, testhtml)
    init_scraper(class_name)