Python get_page_links示例，utils.get_page_links Python示例

示例#1

0

显示文件

def imagevenue(url, name, dest, delim, digits, number):
    """Download the images linked from an imagevenue gallery page.

    Every link on ``url`` that contains "imagevenue.com" is visited, the
    ``img#thepic`` element on that page is looked up, and its ``src`` is
    joined to the imagevenue base URL of the link to form the download URL.

    Args:
        url: Gallery page passed to ``get_page_links``.
        name: Base output name, forwarded to ``set_name``.
        dest: Destination, forwarded to ``download_file``.
        delim: Delimiter, forwarded to ``set_name``.
        digits: Digit count, forwarded to ``set_name``.
        number: Starting counter; incremented after each download call that
            returns without raising.

    Returns:
        None. Links that cannot be processed are reported and skipped.
    """
    print("Downloading images from [imagevenue]...\n")

    links = get_page_links(url, lambda x: "imagevenue.com" in x)

    # The dot is escaped so only a literal "imagevenue.com" is matched.
    regex_base_url = re.compile(r'.*imagevenue\.com', re.IGNORECASE)
    regex_ext = re.compile(r'\.[a-zA-Z]*$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            img = get_elements(link, "img#thepic")

            base_url_match = regex_base_url.search(link)
            # Truthiness check: the former ``img is not []`` compared
            # identity with a brand-new list and was therefore always True.
            if not base_url_match or not img:
                continue

            # image name and filetype
            img_url = img[0]['src']
            ext_match = regex_ext.search(img_url)
            if ext_match is None:
                # no recognisable extension: skip, as before
                continue
            ext = ext_match.group(0)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "{0}/{1}".format(base_url_match.group(0), img_url)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception as exc:
            # Best effort: one bad link must not abort the whole gallery,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print("Skipping %r: %r" % (link, exc))

示例#2

0

显示文件

文件： download.py 项目： appastair/image-dl

def imagevenue(url, name, dest, delim, digits, number):
    """Download the images linked from an imagevenue gallery page.

    Every link on ``url`` that contains "imagevenue.com" is visited, the
    ``img#thepic`` element on that page is looked up, and its ``src`` is
    joined to the imagevenue base URL of the link to form the download URL.

    Args:
        url: Gallery page passed to ``get_page_links``.
        name: Base output name, forwarded to ``set_name``.
        dest: Destination, forwarded to ``download_file``.
        delim: Delimiter, forwarded to ``set_name``.
        digits: Digit count, forwarded to ``set_name``.
        number: Starting counter; incremented after each download call that
            returns without raising.

    Returns:
        None. Links that cannot be processed are reported and skipped.
    """
    print("Downloading images from [imagevenue]...\n")

    links = get_page_links(url, lambda x: "imagevenue.com" in x)

    # The dot is escaped so only a literal "imagevenue.com" is matched.
    regex_base_url = re.compile(r'.*imagevenue\.com', re.IGNORECASE)
    regex_ext = re.compile(r'\.[a-zA-Z]*$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            img = get_elements(link, "img#thepic")

            base_url_match = regex_base_url.search(link)
            # Truthiness check: the former ``img is not []`` compared
            # identity with a brand-new list and was therefore always True.
            if not base_url_match or not img:
                continue

            # image name and filetype
            img_url = img[0]['src']
            ext_match = regex_ext.search(img_url)
            if ext_match is None:
                # no recognisable extension: skip, as before
                continue
            ext = ext_match.group(0)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "{0}/{1}".format(base_url_match.group(0), img_url)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception as exc:
            # Best effort: one bad link must not abort the whole gallery,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print("Skipping %r: %r" % (link, exc))

示例#3

0

显示文件

文件： download.py 项目： appastair/image-dl

def hotflick(url, name, dest, delim, digits, number):
    """Download the images linked from a hotflick gallery.

    Pagination links are read from the ``div.box-paging`` element of
    ``url``. When present, each listed page is scanned for "/v/?q=" links;
    otherwise ``url`` itself is scanned. Each matching link is rewritten
    into a "http://www.hotflick.net/u/n/..." URL and downloaded.

    Args:
        url: Gallery page URL.
        name: Base output name, forwarded to ``set_name``.
        dest: Destination, forwarded to ``download_file``.
        delim: Delimiter, forwarded to ``set_name``.
        digits: Digit count, forwarded to ``set_name``.
        number: Starting counter; incremented after each download call that
            returns without raising.

    Returns:
        None. Links that fail are reported with "exception" and skipped.
    """
    print("Downloading images from [hotflick]...\n")

    # get all page links if the gallery has more than one page
    div = get_html(url).find('div', {"class": "box-paging"})
    if div is None:
        # No paging container found: treat as a single-page gallery instead
        # of raising AttributeError on ``None.findAll``.
        gallery_page_links = []
    else:
        gallery_page_links = [str(tag['href'])
                              for tag in div.findAll('a', href=True)]

    # get image links
    if gallery_page_links:
        links = []
        for page in gallery_page_links:
            links.extend(link for link in get_page_links(
                "http://hotflick.net/" + page) if "/v/?q=" in link)
    else:
        links = [link for link in get_page_links(url) if "/v/?q=" in link]

    regex = re.compile(r'\.net/\w/v/\?q\=(\d+)\.(.*)(\.\w*)$', re.IGNORECASE)

    for link in links:
        try:
            # image name and filetype
            match = regex.search(link)
            ext = match.group(3)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "http://www.hotflick.net/u/n/{0}/{1}{2}".format(
                match.group(1), match.group(2), ext)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            # Best effort per link (this also covers a non-matching link,
            # where ``match`` is None); Ctrl-C / SystemExit now propagate.
            print("exception")

示例#4

0

显示文件

def imagebam(url, name, dest, delim, digits, number):
    """Download the images linked from an imagebam gallery.

    The ``a.pagination_link`` elements of ``url`` are read as page numbers.
    If any exist, links are collected via ``get_imagebam_htmlcode_links``
    using the last page number; otherwise the links on ``url`` containing
    "imagebam.com" are used. Duplicates are dropped, then each link's page
    is searched for ``img`` elements carrying an ``id`` attribute and the
    first one's ``src`` is downloaded.

    Args:
        url: Gallery page URL.
        name: Base output name, forwarded to ``set_name``.
        dest: Destination, forwarded to ``download_file``.
        delim: Delimiter, forwarded to ``set_name``.
        digits: Digit count, forwarded to ``set_name``.
        number: Starting counter; incremented after each download call that
            returns without raising.

    Returns:
        None. Links that cannot be processed are reported and skipped.
    """
    print("Downloading images from [imagebam]...\n")

    # gallery page numbers (ascending)
    page_count = [int(el.contents[0])
                  for el in get_elements(url, "a.pagination_link")]

    if page_count:
        # multi-page gallery
        links = get_imagebam_htmlcode_links(url, page_count[-1])
    else:
        # single-page gallery
        links = get_page_links(url, lambda x: "imagebam.com" in x)

    # remove any duplicate links
    links = list(unique_everseen(links))

    regex = re.compile(r'\.[a-zA-Z]*$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            src = [el['src']
                   for el in get_elements(link, 'img')
                   if 'id' in el.attrs]
            if not src:
                continue

            # image URL
            image_url = src[0]

            # filetype: fall back to ".jpg" when the URL has no extension
            ext_match = regex.search(image_url)
            ext = ".jpg" if ext_match is None else ext_match.group(0)

            # output filename
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception as exc:
            # Best effort: one bad link must not abort the whole gallery,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print("Skipping %r: %r" % (link, exc))

示例#5

0

显示文件

文件： download.py 项目： appastair/image-dl

def imagebam(url, name, dest, delim, digits, number):
    """Download the images linked from an imagebam gallery.

    The ``a.pagination_link`` elements of ``url`` are read as page numbers.
    If any exist, links are collected via ``get_imagebam_htmlcode_links``
    using the last page number; otherwise the links on ``url`` containing
    "imagebam.com" are used. Duplicates are dropped, then each link's page
    is searched for ``img`` elements carrying an ``id`` attribute and the
    first one's ``src`` is downloaded.

    Args:
        url: Gallery page URL.
        name: Base output name, forwarded to ``set_name``.
        dest: Destination, forwarded to ``download_file``.
        delim: Delimiter, forwarded to ``set_name``.
        digits: Digit count, forwarded to ``set_name``.
        number: Starting counter; incremented after each download call that
            returns without raising.

    Returns:
        None. Links that cannot be processed are reported and skipped.
    """
    print("Downloading images from [imagebam]...\n")

    # gallery page numbers (ascending)
    page_count = [int(el.contents[0])
                  for el in get_elements(url, "a.pagination_link")]

    if page_count:
        # multi-page gallery
        links = get_imagebam_htmlcode_links(url, page_count[-1])
    else:
        # single-page gallery
        links = get_page_links(url, lambda x: "imagebam.com" in x)

    # remove any duplicate links
    links = list(unique_everseen(links))

    regex = re.compile(r'\.[a-zA-Z]*$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            src = [el['src']
                   for el in get_elements(link, 'img')
                   if 'id' in el.attrs]
            if not src:
                continue

            # image URL
            image_url = src[0]

            # filetype: fall back to ".jpg" when the URL has no extension
            ext_match = regex.search(image_url)
            ext = ".jpg" if ext_match is None else ext_match.group(0)

            # output filename
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception as exc:
            # Best effort: one bad link must not abort the whole gallery,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print("Skipping %r: %r" % (link, exc))

示例#6

0

显示文件

        # get main page soup
        main_page_soup = utils.get_soup(url_dict[category])

        # get number of pages
        page_count = utils.get_page_count(main_page_soup)[1]

        # iterate through pages and scrape links
        links = []

        for page in range(1, page_count):

            # get links from page
            print(url_dict[category] + "?page-" + str(page))

            soup = utils.get_soup(url_dict[category], "?page-" + str(page))
            links = links + utils.get_page_links(soup)

            print(links)
        # save links to txt for later
        utils.save_data_to_txt(category, link_name_str, links)

        # get list of propertyIDs that have been parsed

        # iterate through links and generate soup
        for link in links:

            #get ID from link
            prop_id = link.split("-")[-1]

            # if id not in scraped list for category then get data
            if prop_id not in scraped_link_list: