def imagevenue(url, name, dest, delim, digits, number):
    print "Downloading images from [imagevenue]...\n"

    links = get_page_links(url, lambda x: "imagevenue.com" in x)

    regex_base_url = re.compile(r'.*imagevenue.com', re.IGNORECASE)
    regex_ext = re.compile(r'\.[a-zA-Z]*$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            img = get_elements(link, "img#thepic")
            base_url_match = regex_base_url.search(link)

            if base_url_match and img:
                # image name and filetype
                img_url = img[0]['src']
                ext = regex_ext.search(img_url).group(0)

                # image URL and output filename
                new_name = set_name(name, ext, delim, number, digits)
                image_url = "{0}/{1}".format(base_url_match.group(0), img_url)

                # download
                download_file(image_url, new_name, dest, number)
                number += 1
        except:
            pass
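# The helpers used above (get_page_links, get_elements, set_name,
# download_file) are defined elsewhere in this module. As a rough guide to
# the naming convention they imply, set_name most likely builds
# "<name><delim><zero-padded number><ext>"; the sketch below is an
# assumption based on how it is called here, not the actual helper:
#
#   def set_name(name, ext, delim, number, digits):
#       # e.g. set_name("photo", ".jpg", "_", 7, 3) -> "photo_007.jpg"
#       return "{0}{1}{2}{3}".format(name, delim, str(number).zfill(digits), ext)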
def hotflick(url, name, dest, delim, digits, number):
    print "Downloading images from [hotflick]...\n"

    # get all page links if the gallery has more than one page
    div = get_html(url).find('div', {"class": "box-paging"})
    gallery_page_links = [str(tag['href'])
                          for tag in div.findAll('a', href=True)]

    # get image links
    if gallery_page_links:
        links = []
        for page in gallery_page_links:
            links.extend([link for link in get_page_links(
                "http://hotflick.net/" + page) if "/v/?q=" in link])
    else:
        links = [link for link in get_page_links(url) if "/v/?q=" in link]

    regex = re.compile(r'\.net/\w/v/\?q\=(\d+)\.(.*)(\.\w*)$', re.IGNORECASE)

    for link in links:
        try:
            # image name and filetype
            match = regex.search(link)
            ext = match.group(3)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "http://www.hotflick.net/u/n/{0}/{1}{2}".format(
                match.group(1), match.group(2), ext)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except:
            print "exception"
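# The regex compiled inside hotflick() splits a viewer link into
# (photo id, base name, extension), which are then reassembled into the
# direct image URL. A worked example (the link below is purely
# illustrative, not a real gallery):
#
#   >>> regex.search("http://hotflick.net/f/v/?q=12345.Some.Gallery.Name.jpg").groups()
#   ('12345', 'Some.Gallery.Name', '.jpg')
#
# giving the download URL http://www.hotflick.net/u/n/12345/Some.Gallery.Name.jpg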
def imagebam(url, name, dest, delim, digits, number):
    print "Downloading images from [imagebam]...\n"

    # gallery page numbers (ascending)
    page_count = [int(el.contents[0])
                  for el in get_elements(url, "a.pagination_link")]

    if page_count:
        # multi-page gallery
        links = get_imagebam_htmlcode_links(url, page_count[-1])
    else:
        # single-page gallery
        links = get_page_links(url, lambda x: "imagebam.com" in x)

    # remove any duplicate links
    links = list(unique_everseen(links))

    regex = re.compile(r'\.[a-zA-Z]*$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            src = [el['src'] for el in get_elements(link, 'img')
                   if 'id' in el.attrs]

            if len(src) > 0:
                # image URL
                image_url = src[0]

                # filetype (default to .jpg when the URL has no extension)
                ext = regex.search(image_url)
                ext = ".jpg" if ext is None else ext.group(0)

                # output filename
                new_name = set_name(name, ext, delim, number, digits)

                # download
                download_file(image_url, new_name, dest, number)
                number += 1
        except:
            pass
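# Each downloader above shares the same calling convention. An illustrative
# invocation (the driver that actually calls these functions lives elsewhere
# in the repo, and the gallery URL below is a placeholder):
#
#   imagebam("http://www.imagebam.com/gallery/example",
#            name="photo", dest="downloads", delim="_", digits=3, number=1)
#
# i.e. save every image in the gallery as downloads/photo_001.jpg,
# downloads/photo_002.jpg, and so on.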
# get main page soup
main_page_soup = utils.get_soup(url_dict[category])

# get number of pages
page_count = utils.get_page_count(main_page_soup)[1]

# iterate through pages and scrape links
links = []
for page in range(1, page_count):
    # get links from page
    print(url_dict[category] + "?page-" + str(page))
    soup = utils.get_soup(url_dict[category], "?page-" + str(page))
    links = links + utils.get_page_links(soup)
    print(links)

# save links to txt for later
utils.save_data_to_txt(category, link_name_str, links)

# get list of propertyIDs that have been parsed
# iterate through links and generate soup
for link in links:
    # get ID from link
    prop_id = link.split("-")[-1]

    # if id not in scraped list for category then get data
    if prop_id not in scraped_link_list: