Python DOM.DOM примеры, pattern.web.DOM.DOM Python примеры использования

Пример #1

0

Показать файл

    def research_on(self, what, where):
        """Search the Pages Blanches directory and collect every result page.

        Downloads the first result page for ``what`` / ``where``, reads the
        number of hits from it, passes that page and every following page
        (20 hits per page) to ``self.exctract_values`` together with
        ``self.myInfo``, and finally calls ``self.myInfo.sort_and_merge()``.

        Args:
            what: search term, appended verbatim to the ``quoiqui`` parameter.
            where: location, appended verbatim to the ``ou`` parameter.
        """
        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0")
        dom = DOM(url.download(cached=True))

        # Start at zero so a page without a result counter no longer leaves
        # the variable unbound; only the first page is processed in that case.
        number_of_results = 0
        for a in dom.by_tag("div.main-title pj-on-autoload "):
            for e in a.by_tag("span.denombrement"):
                # Only the first three characters of the counter text are read.
                number_of_results = int(
                    self.decode_if_unicode(plaintext(e.content))[:3])

        # Ceiling division in integer arithmetic: true division would yield a
        # float on Python 3, which range() below does not accept.
        number_of_page_results = (number_of_results + 19) // 20

        self.exctract_values(dom, self.myInfo)

        for i in range(2, number_of_page_results + 1):
            url = URL(
                "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
                what + "&ou=" + where + "&proximite=0+"
                "&page=" + str(i))
            dom = DOM(url.download(cached=True))
            self.exctract_values(dom, self.myInfo)

        self.myInfo.sort_and_merge()

Пример #2

0

Показать файл

def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: address of the top 250 index page; it is wrapped in ``URL``
            before being downloaded.

    Returns:
        A list of strings, one absolute URL (scheme, domain and path) per
        row matched by the ``.lister-list > tr`` selector.
    '''
    # Download the page and parse it into a DOM representation.
    dom = DOM(URL(url).download())

    movie_urls = []
    for row in dom('.lister-list > tr'):
        # The first link inside the title cell carries the movie's address.
        href = row('td.titleColumn a')[0].attrs["href"]
        # Drop any leading slash from the href so that joining it with the
        # base cannot produce "http://www.imdb.com//title/...".
        movie_urls.append("http://www.imdb.com/" + href.lstrip("/"))

    return movie_urls

Пример #3

0

Показать файл

Файл: imdb-Crawling.py Проект: thomasjurriaan/datapro

def scrape_top_250(url):
    '''
    Collect the movie page addresses from the IMDB top 250 index page.

    Args:
        url: not read by this implementation; the index page is always
            downloaded from the fixed address used below.

    Returns:
        A list of strings, each "http://www.imdb.com" followed by the href
        of the first link found in a ``td.titleColumn`` cell.
    '''
    # The page is fetched here because no parsed DOM is handed in.
    index_page = URL('http://www.imdb.com/chart/top')
    dom = DOM(index_page.download(cached=True))

    site_root = "http://www.imdb.com"
    movie_urls = []

    # Each title cell contributes at most one address: that of its first link.
    for cell in dom.by_tag("td.titleColumn"):
        anchors = cell.by_tag("a")
        if anchors:
            movie_urls.append(site_root + anchors[0].attrs["href"])

    return movie_urls

Пример #4

0

Показать файл

def scrape_top_250(url):
    '''
    Collect the movie page addresses from the IMDB top 250 index page.

    Args:
        url: address of the index page; it is wrapped in ``URL`` before
            being downloaded.

    Returns:
        A list of strings, each 'http://www.imdb.com' followed by the href
        of a link found inside a ``td.titleColumn`` cell (an empty href is
        used when the link has none).
    '''
    dom = DOM(URL(url).download())
    site_root = 'http://www.imdb.com'

    movie_urls = []
    for cell in dom.by_tag("td.titleColumn"):
        # Every link in the cell is recorded, in document order.
        movie_urls.extend(
            site_root + anchor.attributes.get("href", "")
            for anchor in cell.by_tag("a"))

    return movie_urls

Пример #5

0

Показать файл

def get_patent(url):
    """Download ``url + "/fulltext"`` and return ``[title, body]``.

    The title is the plain text of the first ``h3 a`` element and the body
    is the plain text of the first ``#contents`` element of that page.
    """
    page = DOM(URL(url + "/fulltext").download())
    title_text = plaintext(page('h3 a')[0].content)
    body_text = plaintext(page('#contents')[0].content)
    return [title_text, body_text]

Пример #6

0

Показать файл

Файл: top40Scraper.py Проект: DaanvanderThiel/programmeerproject

def scrape_top_250(url):
    '''
    Collect absolute movie page addresses from the IMDB top 250 index page.

    Args:
        url: not read by this implementation; the name is rebound to the
            fixed index page address below.

    Returns:
        A list of strings: the href of every link inside the first 250
        ``td.titleColumn`` cells, resolved against the page address.
    '''
    from pattern.web import abs as make_absolute

    url = URL("http://www.imdb.com/chart/top")
    page = DOM(url.download(cached=True))

    movie_urls = []
    title_cells = page.by_tag("td.titleColumn")[:250]
    for cell in title_cells:
        for anchor in cell.by_tag("a"):
            href = anchor.attrs.get("href", "")
            # Resolve against the redirect target when there is one,
            # otherwise against the requested address.
            movie_urls.append(
                make_absolute(href, base=url.redirect or url.string))

    return movie_urls

Пример #7

0

Показать файл

Файл: common_words_pattern.py Проект: rakeshsukla53/fuzzy-nemesis

def get_artist_docs(name):
    """Return the concatenated lyrics text of every file stored for an artist.

    Every file in the directory ``basedir + name`` is read and parsed; the
    plain text of its first ``.lyrics`` element is collected. Files without
    such an element are skipped.

    Args:
        name: directory name appended to the module-level ``basedir``.

    Returns:
        One string holding the collected texts, joined without a separator.
    """
    default_dir = basedir + name
    collected = []

    for f in os.listdir(default_dir):
        # NOTE: this changes the working directory of the whole process; it
        # is kept because the bare file name below is opened relative to it.
        os.chdir(default_dir)

        # The context manager closes the handle even if reading fails.
        with open(f, 'r') as fi:
            page = fi.read()

        dom = DOM(page)

        # The text of interest lives in the element with class "lyrics".
        if dom and dom('.lyrics'):
            collected.append(plaintext(dom('.lyrics')[0].content))

    # A single join avoids re-copying the growing string for every file.
    return "".join(collected)

Пример #8

0

Показать файл

Файл: imdb-crawler.py Проект: Xlocsinjr/DataProcessing

def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: object whose ``download`` method returns the HTML of the top
            250 index page (the docstring of the original names a
            pattern.web.URL instance).

    Returns:
        A list of strings, each "http://www.imdb.com" followed by the path
        part (everything before the first '?') of the first link in an
        element with class "titleColumn".
    '''
    print(url)

    # "cached" is the intended keyword; the previous spelling "cashed" did
    # not name the download cache option.
    url_html = url.download(cached=True)
    url_dom = DOM(url_html)

    movie_urls = []

    for movie in url_dom.by_class("titleColumn"):
        # The first link in the element holds the movie's address.
        href = movie.by_tag("a")[0].attrs['href']

        # Keep only the part before the query string.
        path = href.split('?')[0]

        movie_urls.append("http://www.imdb.com" + path)

    return movie_urls

Пример #9

0

Показать файл

Файл: test-tvscraper.py Проект: zkkeser/data_processing

 def setUp(self):
     """Build ``self.rows`` from the backup HTML file.

     The rows start with a header row, followed by whatever
     ``extract_tvseries`` yields for the parsed backup page.
     """
     # extract_tvseries does not add a header itself, so it is added here.
     header = ['Title', 'Ranking', 'Genre', 'Actors', 'Runtime']
     with open(BACKUP_HTML, 'r') as backup:
         parsed = DOM(backup.read())
         self.rows = [header]
         self.rows += extract_tvseries(parsed)

Пример #10

0

Показать файл

Файл: python_week_3.py Проект: EEmmiillee/hackerspace-onezero

def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: address of the top 250 index page; it is wrapped in ``URL``
            before being downloaded.

    Returns:
        A list of strings: the href of every link inside a ".titleColumn"
        element, resolved against the address of the downloaded page.
    '''
    # Imported under another name so the builtin abs() is not shadowed.
    from pattern.web import abs as absolute_url

    url = URL(url)
    dom = DOM(url.download(cached=True))

    # Links are resolved against the redirect target when there is one,
    # otherwise against the requested address. Without a base the hrefs
    # cannot be turned into absolute addresses.
    base = url.redirect or url.string

    movie_urls = []
    for e in dom('.titleColumn'):
        for link in e('a'):
            href = link.attributes.get('href', '')
            movie_urls.append(absolute_url(href, base=base))

    return movie_urls

Пример #11

0

Показать файл

Файл: imdb-crawler.py Проект: linzhu1/Dataprocessing

def scrape_top_250(url):
    '''
    Collect the movie page addresses from the IMDB top 250 index page.

    Args:
        url: object whose ``download`` method returns the HTML of the
            index page.

    Returns:
        A list of strings, each "http://imdb.com" followed by the text
        between the first pair of double quotes in the string form of a
        link found in ``tbody.lister-list`` / ``td.titleColumn``.
    '''
    site_root = "http://imdb.com"
    page = DOM(url.download(cached=True))

    movie_urls = []
    for listing in page.by_tag("tbody.lister-list"):
        for cell in listing.by_tag("td.titleColumn"):
            for anchor in cell.by_tag("a"):
                # Second piece of the quote-split string form, i.e. the
                # first double-quoted value in the link's markup.
                quoted_value = str(anchor).split('"')[1]
                movie_urls.append(site_root + quoted_value)
    return movie_urls

Пример #12

0

Показать файл

Файл: scraper.py Проект: MirjaLagerwaard/ProgrammeerprojectData

def extract_incidents(dom):

    incident_list = []
    i = 0

    for incident in dom.by_tag('tr'):
        if i > 0:
            link = INCIDENT_URL + incident.by_tag('a')[0].href
            print link

            url = URL(link)
            html = url.download(timeout=100)
            dom_incident = DOM(html)

            weapons = [weapon.strip() for weapon in dom_incident.by_tag('p')[16].content[27:].split('<br />')]
            weapons = ", ".join(weapons)[:-2]
            latitude = dom_incident.by_tag('p')[2].content[33:].strip()
            longitude = dom_incident.by_tag('p')[3].content[34:].strip()

            description = incident.by_tag('div')[0].content[1:].strip()
            date = incident.by_tag('td')[2].content[1:].strip()
            location = incident.by_tag('td')[3].content[1:].strip()
            violation = incident.by_tag('td')[4].content[1:].strip()
            incident_list.append([link.encode('utf-8'), location.encode('utf-8'), latitude.encode('utf-8'), longitude.encode('utf-8'), date.encode('utf-8'), violation.encode('utf-8'), weapons.encode('utf-8'), description.encode('utf-8')])

        i += 1

    return incident_list

Пример #13

0

Показать файл

def load_dom(url):
    """Fetch ``url`` and return its body parsed as a DOM.

    Returns None when the response status code is anything other than 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return DOM(response.content)

Пример #14

0

Показать файл

def scrape_top_250(url):
    '''
    Collect the movie page addresses from the IMDB top 250 index page.

    Args:
        url: address of the index page; it is wrapped in ``URL`` before
            being downloaded.

    Returns:
        A list of strings, each "http://www.imdb.com" followed by the
        UTF-8 encoded href of a link inside a ``td.titleColumn`` cell.
    '''
    index_dom = DOM(URL(url).download(cached=True))
    link_base = "http://www.imdb.com"

    movie_urls = []
    for cell in index_dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            movie_urls.append(link_base + anchor.attrs["href"].encode("utf-8"))

    return movie_urls

Пример #15

0

Показать файл

def get_patent_urls(keyword, limit=10):
    """Search lens.org for ``keyword`` and return the result links.

    Args:
        keyword: search text; it is quoted with ``urllib.quote_plus``.
        limit: value sent as the ``n`` query parameter.

    Returns:
        A list with "http://www.lens.org" prepended to the href of every
        ``a.link`` element of the result page.
    """
    base_url = "http://www.lens.org"
    query = urllib.quote_plus(keyword)
    search_address = (base_url + "/lens/search?ft=true&l=en&st=true&n=" +
                      str(limit) + "&q=" + query)
    result_page = DOM(URL(search_address).download())

    links = []
    for anchor in result_page('a.link'):
        links.append(base_url + anchor.attributes.get('href'))
    return links

Пример #16

0

Показать файл

def all_lyrics(artist):
    """Return a JSON array of ``[title, lyrics]`` pairs for an artist.

    The artist page (``BASE_URL`` + first letter + '/' + the name without
    whitespace and apostrophes + '.html') is downloaded; every link in
    ``div#listAlbum`` whose absolute address does not contain 'amazon' is
    followed and the plain text of the fifth ``div#main div`` element of
    that page is taken as the lyrics.

    Args:
        artist: artist name; must not be empty because its first character
            is part of the address.

    Returns:
        A JSON string produced by ``json.dumps``.
    """
    clean = re.sub(r"\s+|'", '', artist)
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())

    # Keep each title together with its own link so that filtering out the
    # amazon links cannot shift titles against lyrics.
    songs = []
    for anchor in dom('div#listAlbum a'):
        link = abs(anchor.attributes.get('href', ''),
                   base=url.redirect or url.string)
        if 'amazon' not in link:
            songs.append((anchor.content, link))

    zippy_lyrics = []
    for title, link in songs:
        song_dom = DOM(URL(link).download())
        lyrics = plaintext(song_dom('div#main div')[4].content)
        zippy_lyrics.append((title, lyrics))

    return json.dumps(zippy_lyrics, sort_keys=True)

Пример #17

0

Показать файл

Файл: PyTube.py Проект: Ai-Sasit/Speech-recognition-ioT

 def get_dom_object(self, url_target):
     """Download ``url_target`` and return it parsed as a DOM.

     On any failure a message is printed, ``self.url_query_timeout`` is set
     to 1 and None is returned.

     Args:
         url_target: address of the page to download.

     Returns:
         The parsed DOM, or None when the download or parsing failed.
     """
     # Bound up front so the failure path returns None instead of raising
     # UnboundLocalError at the return statement.
     dom_object = None
     try:
         url = URL(url_target)
         dom_object = DOM(url.download(cached=True))
     except Exception:
         # NOTE(review): the message reports self.target_url_str rather than
         # url_target -- confirm this is the intended value.
         print('Problem retrieving data for this url: ',
               self.target_url_str)
         self.url_query_timeout = 1
     return dom_object

Пример #18

0

Показать файл

def scrape(url):
    """Write one CSV line per top 40 entry to "allMusicOneWeek.csv".

    NOTE(review): the ``url`` argument is overwritten on the first line of
    the ``with`` block, so the page that is scraped is always the
    hard-coded week-46 address.

    Each line starts with a running position counter, followed by the
    content of the "credit" elements and of the second to fifth ``strong``
    elements of the entry; the labels in the comments below are the
    author's -- verify them against the page.
    """
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        # Last path segment of the address; not used further in this function.
        week = url.split("/")
        week = week[-1]
        url = URL("http://www.top40.nl/top40/2015/week-46")
        dom = DOM(url.download(cached=True))
        # running position counter, starts at 1
        i = 1
        # select the top 40 list

        for l in dom.by_tag("ol.top40"):
            # select per song
            print "lijst top 40"
            for e in l.by_tag("div.clearfix"):
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                print i, 'positie'
                i += 1  # careful with resetting
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # position
                for inner in e.by_tag("strong")[1:2]:
                    print inner.content, "1:2"
                    muziekGegevens += inner.content + ","
                # highest ranking
                for inner in e.by_tag("strong")[2:3]:
                    print inner.content, "2:3"
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    print inner.content, "3:4"
                    muziekGegevens += inner.content + ","
                # year of the song
                for inner in e.by_tag("strong")[4:5]:
                    print inner.content.strip(), "4:5"
                    muziekGegevens += inner.content.strip()
                # Replace HTML entities in the assembled line.
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)

                # presumably whatisthis() reports whether the value is
                # already unicode -- verify against its definition. The file
                # was opened with io.open, which only accepts unicode text.
                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                    print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                    f.write(muziekGegevens + "\n")
                else:
                    f.write(muziekGegevens + "\n")

    # Output and traceback recorded by the author from an earlier run:
    #                     1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    #   File "testhtmlscraper.py", line 58, in <module>
    #     f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
    # NOTE(review): this only references the method without calling it; the
    # file has already been closed by the with block above.
    f.close

Пример #19

0

Показать файл

def box_office_titles():
    """Return the content of every ``MOVIE_TITLE_TAG`` element on the page.

    The page at ``BOX_OFFICE_URL`` is downloaded and parsed; the result is
    whatever ``map`` returns for the matched elements' ``content``.
    """
    page = DOM(URL(BOX_OFFICE_URL).download())
    return map(lambda element: element.content, page(MOVIE_TITLE_TAG))

Пример #20

0

Показать файл

    def extract_pic_url(self):
        """Append the image addresses found in ``self.page_source``.

        At most ``self.image_dl_per_search`` ``a.rg_l`` links are examined.
        For each one the text between 'imgurl=' and '&imgrefurl' in its href
        is appended to ``self.pic_url_list``; links without that pattern
        (or without an href) are reported with a printed message.
        """
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')

        for tag in tag_list[:self.image_dl_per_search]:
            # A missing href is treated like a non-matching one.
            match = re.search('imgurl=(.*)&imgrefurl',
                              tag.attributes.get('href', ''))
            # Test the match explicitly instead of relying on a bare except
            # to absorb the AttributeError raised by None.group().
            if match is None:
                print('error parsing', tag)
                continue
            self.pic_url_list.append(match.group(1))

Пример #21

0

Показать файл

def main():
    '''
    Crawl the IMDB top 250 movies, save CSV with their information.

    Note:
        This function also makes backups of the HTML files in a sub-directory
        called HTML_BACKUPS (those will be used in grading).
    '''

    # Create a directory to store copies of all the relevant HTML files (those
    # will be used in testing).
    print 'Setting up backup dir if needed ...'
    create_dir(BACKUP_DIR)

    # Make backup of the IMDB top 250 movies page
    print 'Access top 250 page, making backup ...'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # extract the top 250 movies
    print 'Scraping top 250 page ...'
    url_strings = scrape_top_250(top_250_url)

    # grab all relevant information from the 250 movie web pages
    rows = []
    for i, url in enumerate(url_strings):  # Enumerate, a great Python trick!
        print 'Scraping movie %d ...' % i
        # Grab web page
        movie_html = URL(url).download(cached=True)
        # Extract relevant information for each movie
        movie_dom = DOM(movie_html)
        rows.append(scrape_movie_page(movie_dom))

        # Save one of the IMDB's movie pages (for testing)
        if i == 83:
            html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
            make_backup(html_file, movie_html)

    # Save a CSV file with the relevant information for the top 250 movies.
    print 'Saving CSV ...'
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)

Пример #22

0

Показать файл

    def downloadContent(self):
        """Download the page into ``self.content`` and parse it.

        After a successful download ``self.decodeContent()`` is called and
        the parsed DOM is stored in ``self.dom``.

        Raises:
            URLError: when ``self.isWebPage()`` is false, or when the
                download raises ``httplib.InvalidURL``.
        """
        if not self.isWebPage():
            raise URLError("Invalid or empty content type")

        try:
            self.content = self.url.download(timeout=1)
        except httplib.InvalidURL:
            raise URLError("Invalid URL")
        else:
            # Reached only when the download raised nothing.
            self.decodeContent()
            self.dom = DOM(self.content)

Пример #23

0

Показать файл

Файл: Python_Computational_Linguistics_Word_Inflections.py Проект: VakinduPhilliam/Python_Computation_Linguistics

def inflect(word, language="italian"):
    """Look up the inflections of ``word`` on the English Wiktionary.

    The page for ``word`` is downloaded, the element with id ``language``
    is located, and the siblings following its parent are scanned until an
    ``hr`` element or the end of the sibling chain. Siblings whose plain
    text starts with ``word`` are matched against a series of regular
    expressions.

    Args:
        word: the word to look up; spaces are replaced by underscores in
            the address. It is inserted into the patterns unescaped.
        language: id of the element that marks the language section.

    Returns:
        A dict mapping part-of-speech prefix + gender code to the matched
        form, e.g. {"adj-m": "affetto", "adj-fp": "affette"}.
    """
    inflections = {}
    url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_")
    dom = DOM(URL(url).download(throttle=10, cached=True))

    pos = ""

    # Search the header that marks the start for the given language:
    # <h2><span class="mw-headline" id="Italian">Italian</span></h2>

    e = dom("#" + language)[0].parent

    while e is not None:

        if e.type == "element":

            if e.tag == "hr": # Horizontal line = next language.
                break

            if e.tag == "h3": # <h3>Adjective [edit]</h3>
                pos = plaintext(e.content.lower())
                pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-"

            # Parse inflections, using regular expressions.

            s = plaintext(e.content)

            # affetto m (f affetta, m plural affetti, f plural affette)

            if s.startswith(word):

                # Each entry: gender code, pattern, index of the group that
                # holds the inflected form. Later matches overwrite earlier
                # ones stored under the same key.
                for gender, regexp, i in (
                  ("m" , r"(" + word + r") m", 1),
                  ("f" , r"(" + word + r") f", 1),
                  ("m" , r"(" + word + r") (mf|m and f)", 1),
                  ("f" , r"(" + word + r") (mf|m and f)", 1),
                  ("m" , r"masculine:? (\S*?)(,|\))", 1),
                  ("f" , r"feminine:? (\S*?)(,|\))", 1),
                  ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3),
                  ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3),
                  ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3),
                  ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3),
                  ( "p", r"(\(|, )plural (\S*?)(,|\))", 2),
                  ( "p", r"m and f plural (\S*?)(,|\))", 1)):
                    m = re.search(regexp, s, re.I)
                    if m is not None:
                        # {"adj-m": "affetto", "adj-fp": "affette"}
                        inflections[pos + gender] = m.group(i)

        # Advance to the next sibling on every pass. This line must sit at
        # the indentation of the while body: the previous nine-column
        # indent was a syntax error.
        e = e.next_sibling

    return inflections

Пример #24

0

Показать файл

    def extract_percentages(dom):
        """Return the first 'percentage' value found at ``TARGET_URL``.

        The ``dom`` argument is not read; the page is downloaded again from
        ``TARGET_URL``.

        Returns:
            The UTF-8 encoded content of the first element with class
            'percentage', or the string "nodata" when there is none.
        """
        page = DOM(URL(TARGET_URL).download())
        items = page.by_class('percentage')

        if not items:
            return "nodata"

        percentage_list = [item.content.encode('utf-8') for item in items]
        return percentage_list[0]

Пример #25

0

Показать файл

def extract_data_ML(i):
    """Return description and mp3 path for Macaulay Library audio item ``i``.

    Returns:
        A dict with keys 'index' (the argument), 'desc' (the ``content``
        attribute of the first ``meta`` element) and 'mp3' (the first
        double-quoted value on the first line starting with 'file' inside
        the first script that contains 'jwplayer(').
    """
    page = DOM(URL('http://macaulaylibrary.org/audio/%s' % i).download())
    description = page('meta')[0].attr['content']

    player_scripts = [
        script.content for script in page('script')
        if 'jwplayer(' in script.content
    ]
    file_lines = [
        line.strip() for line in player_scripts[0].split('\n')
        if line.strip().startswith('file')
    ]
    path_to_mp3 = file_lines[0].split('"')[1]

    return {'index': i, 'desc': description, 'mp3': path_to_mp3}

Пример #26

0

Показать файл

 def create_dom_object(self):
     """ Create dom object based on element for scraping
         Take into consideration that there might be query problem.
         
     """
     try:
         url = URL(self.full_url_str)
         self.dom_object = DOM(url.download(cached=True))
     except:
         if self.__print_url_finding_error:
             print 'Problem retrieving data for this url: ', self.full_url_str
         self.url_query_timeout = 1

Пример #27

0

Показать файл

Файл: pykytdownloader.py Проект: ishegatron/pykaraoke

 def get_dom_object(self, url_target):
     """Fetch ``url_target``, render its JavaScript and return the DOM.

     Args:
         url_target: address of the page to fetch.

     Returns:
         The DOM built from the rendered HTML, or None on failure; in that
         case ``self.ErrorReason`` holds a message naming the address.
     """
     try:
         session = HTMLSession()
         # get the html content
         response = session.get(url_target)
         # execute Java-script
         response.html.render(timeout=30, sleep=2)
         return DOM(response.html.html)
     except Exception:
         # Exception instead of a bare except, so that KeyboardInterrupt
         # and SystemExit still propagate.
         self.ErrorReason = 'Problem retrieving data for this url: ' + url_target + '.\nPlease check your Internet connection.'
         return None

Пример #28

0

Показать файл

    def extract_pic_url(self):
        """ extract all the raw pic url in list
 
        """
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')
        print len(tag_list)
        for tag in tag_list[:self.nb_images]:
            tar_str = re.search('imgurl=(.*)&imgrefurl',
                                tag.attributes['href'])
            try:
                self.pic_url_list.append(tar_str.group(1))
            except:
                print 'error parsing', tag

Пример #29

0

Показать файл

def conjugate(verb, language="italian"):
    """Look up the conjugation of ``verb`` on the English Wiktionary.

    The page for ``verb`` is downloaded and the first
    ``table.inflection-table`` found is parsed row by row.

    Args:
        verb: the verb to look up; it is inserted into the page address.
        language: compared against the id of the ``span`` inside preceding
            ``h2`` elements (see the review note in the body).

    Returns:
        A dict that always contains "infinitive"; entries keyed by
        ``(tense, mood)`` hold lists of forms and entries keyed by a
        participle name hold a single form. An empty dict is returned when
        the page has no inflection table.
    """

    url = URL("http://en.wiktionary.org/wiki/%s" % verb)
    dom = DOM(url.download(throttle=10, cached=True))
    conj = {"infinitive": verb}
    mood = None

    for table in dom("table.inflection-table"):

        # Search the header that marks the start for the given language:
        # <h2><span class="mw-headline" id="Italian">Italian</span></h2>

        h2 = table.parent.parent

        # NOTE(review): this loop walks backwards until ``previous`` yields
        # a falsy value. The ``continue`` below applies to this while loop,
        # not to the enclosing for loop, so it does not appear to skip
        # tables of other languages -- confirm the intended behavior.
        while h2:
            h2 = h2.previous

            if getattr(h2, "tag", "") == "h2" and \
               getattr(h2("span")[0], "id", "") != language:
                continue

        for tr in table("tr"):

            for th in tr("th"):

                # <th>indicative</th>

                if th.content in MOOD:
                    mood = th.content

                # <th>present</th><td>sono</td><td>sei></td>...

                if th.content in TENSE:
                    conj[th.content,
                         mood] = [plain(td.content) for td in tr("td")]

                # <th>gerund</th><td>essendo</td>

                if th.content in PARTICIPLE:
                    conj[th.content] = plain(th.next.next.content)

            # <th>imperative</th></tr><tr><td></td><td>sii</td>...

            if mood == "imperative" and len(tr("th")) == 0:
                conj["present", mood] = [plain(td.content) for td in tr("td")]

        # Returning inside the for loop means only the first table is used.
        return conj

    return {}

Пример #30

0

Показать файл

    def extract_images_url(self):
        """Collect image addresses from the search page ``self.url_search``.

        A Chrome webdriver loads the page, scrolls 30 times (clicking the
        first ``input`` of type button whenever one can be clicked) and the
        resulting page source is stored in ``self.driver_source``. The text
        between 'imgurl=' and '&imgrefurl' in the href of at most
        ``self.num_images`` ``a.rg_l`` links is appended to
        ``self.images_url_list``. ``self.num_images`` is lowered to the
        number of links found when it exceeds it.
        """
        # Initialize Chrome Webdriver using Selenium.
        driver = webdriver.Chrome("/usr/local/bin/chromedriver")
        try:
            driver.get(self.url_search)

            # Scroll around the page.
            init_position = 0
            move_to = 200000
            for _ in range(30):
                window_scroll = "window.scrollBy(" + str(
                    init_position) + "," + str(move_to) + ")"
                driver.execute_script(window_scroll)
                time.sleep(0.2)
                init_position = move_to
                move_to = move_to + 100000
                # Click the "show more results" button when it is present;
                # a failed lookup or click is deliberately ignored.
                try:
                    driver.find_element_by_xpath(
                        "//input[@type='button']").click()
                    print("Click!")
                except Exception:
                    continue
            time.sleep(0.5)
            self.driver_source = driver.page_source
        finally:
            # Close the browser even when loading or scrolling failed, so
            # no Chrome process is left behind.
            driver.quit()

        # Retrieve the different image addresses from the page.
        dom = DOM(self.driver_source)
        tag_list = dom('a.rg_l')
        print("Total images retrieved: " + str(len(tag_list)))
        # Never ask for more images than the page offers.
        if (self.num_images > len(tag_list)):
            self.num_images = len(tag_list)
        # Only examine up to self.num_images links.
        for tag in tag_list[:self.num_images]:
            # A missing href is treated like a non-matching one.
            match = re.search('imgurl=(.*)&imgrefurl',
                              tag.attributes.get('href', ''))
            if match is None:
                print('error parsing', tag)
                continue
            self.images_url_list.append(match.group(1))

        # Report how many addresses were collected (links that could not be
        # parsed are not counted).
        print("\nTotal number of URL images: " +
              str(len(self.images_url_list)))

Python DOM.DOM примеры использования