Example #1
def test_abs(self):
    # Assert absolute URL (special attention for anchors).
    for a, b in (("../page.html", "http://domain.com/path/"),
                 ("page.html", "http://domain.com/home.html")):
        v = web.abs(a, base=b)
        self.assertEqual(v, "http://domain.com/page.html")
    for a, b, c in (("#anchor", "http://domain.com", "/"),
                    ("#anchor", "http://domain.com/", ""),
                    ("#anchor", "http://domain.com/page", "")):
        v = web.abs(a, base=b)
        self.assertEqual(v, b + c + a)  # http://domain.com/#anchor
    print("pattern.web.abs()")
Example #2
def test_abs(self):
    # Assert absolute URL (special attention for anchors).
    for a, b in (("../page.html", "http://domain.com/path/"), ("page.html", "http://domain.com/home.html")):
        v = web.abs(a, base=b)
        self.assertEqual(v, "http://domain.com/page.html")
    for a, b, c in (
        ("#anchor", "http://domain.com", "/"),
        ("#anchor", "http://domain.com/", ""),
        ("#anchor", "http://domain.com/page", ""),
    ):
        v = web.abs(a, base=b)
        self.assertEqual(v, b + c + a)  # http://domain.com/#anchor
    print("pattern.web.abs()")
Example #3
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # list for absolute URLs to each movie
    movie_urls = []

    # base-url of the main page
    base_url = URL("http://www.imdb.com/")

    # obtaining the urls
    for element in DOM(url.download()).by_tag("td.titleColumn"):
        for link in element.by_tag("a"):
            link = link.attrs.get("href","")
            link = abs(link, base=base_url.redirect or base_url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #4
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    absurl = URL("http://www.imdb.com")
    top_250_html = url.download(cached=True)
    links_dom = DOM(top_250_html)
    for content in links_dom("td.titleColumn"):
        for link in content("a"):
            partlink = abs(link.attributes.get("href", ""), base=absurl.redirect or absurl.string)
            movie_urls.append(partlink)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #5
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #6
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    dom = DOM(url.download())
    from pattern.web import abs
    url = URL("http://imdb.com")
    for x in dom.by_tag("td.titleColumn"):
        x = x.by_tag("a")[0]
        x = x.attrs.get("href", "")
        x = abs(x, base=url.redirect or url.string)
        movie_urls.append(x)
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #7
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs

    movie_urls = []
    html = url.download(cached=True)
    dom = DOM(html)

    for a in dom.by_tag("tbody.lister-list"):
        for b in a.by_tag("td.titleColumn"):
            for c in b.by_tag("a"):
                link = c.attrs.get("href","")
                link = abs(link, base=url.redirect or url.string)
                movie_urls.append(link)

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #8
def getImages(self):
    images = []
    for image in self.dom('img'):
        images.append(
            abs(image.attributes.get('src', ''),
                base=self.url.redirect or self.url.string))
    return images
Example #9
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))
    
    for e in dom('.titleColumn'):
        for link in e('a'):
            movie_urls.append(abs(link.attributes.get('href', ''), base=url.redirect or url.string))
            
    # return url list
    return movie_urls
Example #10
    def convert_to_abs(self, link):
        """Converts a relative URL to an absolute url.

        e.g. '/biz/hyatt-#hrid:123' --> 'http://www.hipadvisor.com/biz/hyatt-#hrid123'
        """
        base_url = URL(self.url)
        return abs(link, base=base_url.string)
Example #11
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))

    for e in dom('.titleColumn'):
        for link in e('a'):
            movie_urls.append(abs(link.attributes.get('href', ''), base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #12
def getScripts(self):
    scripts = []
    for script in self.dom('script'):
        src = script.attributes.get('src', '')
        if src:
            scripts.append(
                abs(src, base=self.url.redirect or self.url.string))
        else:
            scripts.append(str(script))
    return scripts
Example #13
def scrape_heat_urls(dom):
    days = []
    for i in range(1,3):
        temp = dom.by_tag("table")[i]
        day = temp.by_tag("th")[1].content
        rows = temp.by_tag("tr")[1:]
        heat_urls = []
        for row in rows:
            partial_url = row.by_tag("a")[0]
            # NOTE: `url` is assumed to be a module-level URL instance; it is
            # not defined in this snippet.
            heat_url = abs(partial_url.attributes.get('href', ''), base=url.redirect or url.string)
            heat_urls.append(heat_url)
        days.append(heat_urls)
    return days
Example #14
    def getLinks(self):
        if self.content is None:
            return self.links

        if len(self.links) == 0:
            links = [
                abs(x.url, base=self.url.redirect or self.url.string)
                for x in HTMLLinkParser().parse(self.content,
                                                url=self.url.string)
            ]
            self.links = [
                WebPage(x, self, depth=self.depth + 1) for x in links
            ]
        return self.links
Example #15
def all_lyrics(artist):
    clean = re.sub(r"\s+|'", '', artist)
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())
    titles = [a.content for a in dom('div#listAlbum a')]
    ew_amazon = [
        abs(link.attributes.get('href', ''), base=url.redirect or url.string)
        for link in dom('div#listAlbum a')
    ]
    songlinks = [l for l in ew_amazon if 'amazon' not in l]
    lyrics = []
    for link in songlinks:
        song_url = URL(link)
        song_dom = DOM(song_url.download())
        lyrics.append(plaintext(song_dom('div#main div')[4:5][0].content))
    zippy_lyrics = list(zip(titles, lyrics))  # list() so json.dumps works on Python 3
    return json.dumps(zippy_lyrics, sort_keys=True)
Example #16
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    dom = DOM(url.download())
    # return the list of URLs of each movie's page on IMDB
    return [abs(a.attrs["href"], base=url.redirect or url.string)
            for a in dom('.lister-list > tr > td.titleColumn > a')]
Example #17
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """
    movie_urls = []
    dom = DOM(url.download(cached=True))

    allurls = dom.get_elements_by_classname("titleColumn")
    for oneurl in allurls:
        # Use the <a> inside each "titleColumn" cell rather than indexing
        # child nodes by position.
        link = abs(oneurl.by_tag("a")[0].attrs.get("href", ""), base=url.redirect or url.string)
        movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #18
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    dom = DOM(url.download(cached=True))
    movie_urls = []

    # iterate over movies in top 250 IMDB list
    for movie in dom('td.titleColumn'):
        # adds absolute URL of movie to movie_urls array
        for link in movie.by_tag('a'):
            movie_urls.append(abs(link.attributes.get('href',''), base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #19
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    for link in DOM(url.download()).by_tag('td.titleColumn'):
        link = link.by_tag('a')[0]
        link = link.attrs.get('href',"")
        link = abs(link, base=url.redirect or url.string)
        movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #20
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print(plaintext(a.content))
        print(a.attributes["href"])
        print("")

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    # print(link)

# The DOM object is a tree of Element and Text objects.
# All objects inherit from Node, DOM also inherits from Element.

# Node.type          => NODE, TEXT, COMMENT, ELEMENT, DOM
# Node.parent        => Parent Node object.
# Node.children      => List of child Node objects.
# Node.next          => Next Node in Node.parent.children.
# Node.previous      => Previous Node in Node.parent.children.

# DOM.head      => Element with tag name "head".
# DOM.body      => Element with tag name "body".

# Element.tag        => Element tag name, e.g. "body".
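
# A short sketch of what that tree looks like in practice. The one-line HTML
# document below is made up purely for illustration; the property values shown
# in the comments follow the list above.
from pattern.web import DOM

doc = DOM("<html><body>hello <b>world</b></body></html>")
for node in doc.body.children:
    # Children mix Text and Element nodes; both inherit from Node.
    print(node.type)        # TEXT for "hello ", ELEMENT for <b>.
    print(node.parent.tag)  # "body" in both cases.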
Example #21
# With the movie links, scrape each entry.
# You will get the following items:
# Produce a CSV text file (using semicolons to separate the entries) with a
# header row and the fields:
#        Title of movie
#        Runtime
#        Genre (separated by semicolons if multiple)
#        Director(s)
#        Writer(s)
#        Actors (listed on the page directly only or first three, separated by semicolons)
#        Ratings
#        Number of Ratings
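
# A hedged sketch of writing that file with the standard csv module (the
# filename "top250.csv" and the `movies` list of row tuples are hypothetical
# placeholders, not part of the original script; Python 3 file mode shown).
import csv

with open("top250.csv", "w", newline="") as output:
    writer = csv.writer(output, delimiter=";")
    # Header row with the fields listed above; csv quotes any field that
    # itself contains the ";" delimiter (e.g. multiple genres).
    writer.writerow(["Title", "Runtime", "Genre", "Director(s)",
                     "Writer(s)", "Actors", "Ratings", "Number of Ratings"])
    for row in movies:
        writer.writerow(row)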


allElements = dom.by_tag("a")
for e in allElements:
    movieTitleLinks = re.match("http://www.imdb.com/title/.*", abs(e.attributes.get('href',''), base=url.redirect or url.string))

    # Follow the links
    if movieTitleLinks:

        movieUrl = URL(movieTitleLinks.group(0))
        movieDom = DOM(movieUrl.download(cached=True))
        
        
        #=======================================================================
        # Get the title
        #=======================================================================
        for movie in movieDom.by_tag("title"):
            title = re.sub(r' \(\d+\) - IMDb', '', movie.content.encode('ascii', 'ignore').strip())

            
Example #22
# Get the DOM object to scrape for movie links. [Hint: Use absolute URL's.
# Documentation can be found here: http://www.clips.ua.ac.be/pages/pattern-web] 
url = URL("http://www.opentable.com/promo.aspx?m=7&ref=470&pid=90")
dom = DOM(url.download(cached=True))

for restaurant in dom.by_class("ResultRow"):
    # All of the summary info lives in the first "rinfo" block of the row.
    info = restaurant.by_class("ReCol")[0].by_class("rinfo")[0]
    name = info.by_tag("a")[0].content.encode('ascii', 'ignore')
    neighborhood_cuisine = info.by_class("d")[0].content.encode('ascii', 'ignore').split('|')
    neighborhood = neighborhood_cuisine[0]
    cuisine = neighborhood_cuisine[1]
    meals = info.by_class("message")[0].content.encode('ascii', 'ignore')
    # keep only the text before the first tag; still needs cleaning
    meals = meals.split('<')[0]
    restURL = URL(abs(info.by_tag("a")[0].attributes.get('href', ''), base=url.redirect or url.string))
    restDOM = DOM(restURL.download(cached=True))
    # need to clean
    address = restDOM.by_id("ProfileOverview_lblAddressText").content
    price = restDOM.by_id("ProfileOverview_lblPriceText").content
    try:
        ratings = restDOM.by_id("RestPopLabel_ReviewsFormat").attributes['title']
    except (TypeError, AttributeError):
        ratings = 'not available'
    style = restDOM.by_id("ProfileOverview_DiningStyle").by_class("value")[0].content
    try:
        website = restDOM.by_id("ProfileOverview_Website").by_tag("a")[0].content
    except AttributeError:
        website = "not available"
    phone = restDOM.by_id("ProfileOverview_Phone").by_class("value")[0].content
Example #23
                name = name.encode('ascii', 'ignore')
            for j in g.by_class('category infoItem')[0:]:
                category = j.content
                category = plaintext(category)
                category = category.encode('ascii', 'ignore')
            if (g.by_class('tags infoItem')):
                tag = g.by_class('tags infoItem')[0].content
                tag = tag.encode('ascii', 'ignore')
            else:
                tag = " "
            for k in g.by_class('visits')[0:]:
                visits = k.content
                visits = visits[0:-6]
                visits = visits.encode('ascii', 'ignore')
            for l in g.by_class('description')[0:]:
                description = l.content
                description = description.encode('ascii', 'ignore')
            for link in g.by_tag('a')[1:2]:
                links = abs(link.attributes.get('href', ''),
                            base=url.redirect or url.string)
                # `ff` is assumed to be a Selenium WebDriver created earlier
                # in the (truncated) script.
                ff.get(links + "/about")
                element = ff.find_element_by_class_name("row_count")
                time.sleep(8)
                element_text = element.text

            # Write each row to the file.
            writer.writerow([
                name, category, tag, visits, description, links, element_text
            ])

output.close()
Example #24
File: 12-dom.py  Project: clips/pattern
for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah")[:5]: # Top 5 reddit entries.
    for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]:
        print(plaintext(a.content))
        print(a.attrs["href"])
        print("")

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    print(link)

# The DOM object is a tree of nested Element and Text objects.
# All objects inherit from Node (check the source code).

# Node.type       : NODE, TEXT, COMMENT, ELEMENT or DOM
# Node.parent     : Parent Node object.
# Node.children   : List of child Node objects.
# Node.next       : Next Node in Node.parent.children.
# Node.previous   : Previous Node in Node.parent.children.

# DOM.head        : Element with tag name "head".
# DOM.body        : Element with tag name "body".

# Element.tag     : Element tag name, e.g. "body".
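
# A quick, self-contained illustration of these properties. The HTML string is
# made up for the example; the head/body lookups and sibling navigation follow
# the list above.
from pattern.web import DOM

doc = DOM("<html><head><title>Hi</title></head><body><p>One</p><p>Two</p></body></html>")
print(doc.head.by_tag("title")[0].content)  # "Hi"
p1, p2 = doc.body.by_tag("p")               # Two <p> Elements under DOM.body.
print(p1.tag)                               # "p"
print(p1.next.content)                      # "Two": p1's next sibling in body.children.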
Example #26
#!/usr/bin/env python