def research_on(self, what, where):
    """Search the pagesjaunes.fr white pages and process every result page.

    The first result page is downloaded and the hit counter found in
    ``span.denombrement`` is read.  The first page and then every following
    page (20 results per page) are passed to ``self.exctract_values``
    together with ``self.myInfo``; finally ``self.myInfo.sort_and_merge()``
    is called.

    Args:
        what: Search term, inserted verbatim into the ``quoiqui`` parameter.
        where: Location, inserted verbatim into the ``ou`` parameter.
    """
    results_per_page = 20
    search = ("https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui="
              + what + "&ou=" + where)

    url = URL(search + "&proximite=0")
    dom = DOM(url.download(cached=True))

    # Start from zero so a page without a counter element is treated as
    # "no further pages" instead of failing on an unbound variable.
    number_of_results = 0
    for a in dom.by_tag("div.main-title pj-on-autoload "):
        for e in a.by_tag("span.denombrement"):
            # Only the first three characters of the counter text are parsed.
            number_of_results = int(
                self.decode_if_unicode(plaintext(e.content))[:3])

    # Integer ceiling division: `//` keeps the page count an int on both
    # Python 2 and Python 3, which range() below requires.
    number_of_page_results = (
        (number_of_results + results_per_page - 1) // results_per_page)

    self.exctract_values(dom, self.myInfo)
    for i in range(2, number_of_page_results + 1):
        url = URL(search + "&proximite=0+" "&page=" + str(i))
        dom = DOM(url.download(cached=True))
        self.exctract_values(dom, self.myInfo)

    self.myInfo.sort_and_merge()
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: address of the top 250 index page; it is wrapped in
            pattern.web.URL before downloading.

    Returns:
        A list of strings, one absolute URL (scheme, domain and path) per
        table row found under ``.lister-list``.
    '''
    imdb_root = "http://www.imdb.com/"

    # Download the index page and parse it into a DOM representation.
    index_dom = DOM(URL(url).download())

    movie_urls = []
    for row in index_dom('.lister-list > tr'):
        # The first link inside the title cell points at the movie page.
        href = row('td.titleColumn a')[0].attrs["href"]
        # The root already ends with "/", so drop any leading slash from the
        # href; otherwise the result would contain "//" after the domain.
        movie_urls.append(imdb_root + href.lstrip("/"))

    return movie_urls
def scrape_top_250(url):
    '''
    Collect the movie page URLs listed on the IMDB top 250 index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page.
            NOTE(review): this argument is not read; the index address is
            hard-coded in the body.

    Returns:
        A list of absolute URL strings, one for the first link of every
        ``td.titleColumn`` cell.
    '''
    # The page is downloaded here because no DOM is handed in by the caller.
    index_page = URL('http://www.imdb.com/chart/top')
    dom = DOM(index_page.download(cached=True))

    site_root = "http://www.imdb.com"
    movie_urls = []
    for cell in dom.by_tag("td.titleColumn"):
        anchors = cell.by_tag("a")
        # Only the first anchor of a cell is used; cells without one are skipped.
        if anchors:
            movie_urls.append(site_root + anchors[0].attrs["href"])

    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: address of the top 250 index page; it is wrapped in
            pattern.web.URL before downloading.

    Returns:
        A list of absolute URL strings built from the links found inside
        the ``td.titleColumn`` cells.
    '''
    home_url = 'http://www.imdb.com'
    dom = DOM(URL(url).download())

    movie_urls = []
    for cell in dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            href = anchor.attributes.get("href", "")
            # An anchor without an href would otherwise add the bare site
            # root to the result, which is not a movie page.
            if href:
                movie_urls.append(home_url + href)

    return movie_urls
def get_patent(url):
    """Download the full-text page of a patent and return its title and body.

    Args:
        url: patent page address; "/fulltext" is appended before downloading.

    Returns:
        A two-item list ``[title, body]`` holding the plain text of the first
        ``h3 a`` element and of the first ``#contents`` element.
    """
    page = DOM(URL(url + "/fulltext").download())
    heading = page('h3 a')[0]
    contents = page('#contents')[0]
    return [plaintext(heading.content), plaintext(contents.content)]
def scrape_top_250(url):
    '''
    Collect absolute movie page URLs from the IMDB top 250 index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page.
            NOTE(review): the value passed in is replaced by a hard-coded
            address before anything is downloaded.

    Returns:
        A list of URL strings, one per link found in the first 250
        ``td.titleColumn`` cells, each resolved against the page address.
    '''
    # Imported locally; inside this function the name `abs` refers to the
    # pattern.web helper, not the built-in.
    from pattern.web import abs

    url = URL("http://www.imdb.com/chart/top")
    page = DOM(url.download(cached=True))

    movie_urls = []
    for cell in page.by_tag("td.titleColumn")[:250]:
        for anchor in cell.by_tag("a"):
            href = anchor.attrs.get("href", "")
            # Resolve against the redirect target when there is one.
            movie_urls.append(abs(href, base=url.redirect or url.string))

    return movie_urls
def get_artist_docs(name):
    """Concatenate the plain-text lyrics of every saved page of an artist.

    Every file in ``basedir + name`` is read and parsed; when the page has a
    ``.lyrics`` element, the plain text of the first one is added to the
    result.  Files without such an element are skipped.

    Args:
        name: directory name of the artist, appended to ``basedir``.

    Returns:
        One string holding the lyrics of all pages, in directory-listing
        order, with nothing inserted between them.
    """
    default_dir = basedir + name
    collected = []

    for f in os.listdir(default_dir):
        # Kept from the original: the working directory is switched to the
        # artist directory so the bare file name can be opened.
        os.chdir(default_dir)

        # The context manager closes the handle; before, one descriptor was
        # left open per file.
        with open(f, 'r') as fi:
            page = fi.read()

        dom = DOM(page)
        if not dom:
            continue

        # Query once and reuse the result.
        matches = dom('.lyrics')
        if not matches:
            continue

        collected.append(plaintext(matches[0].content))

    return "".join(collected)
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL of a
        movie's page on IMDB with its query string removed.
    '''
    print(url)

    # The keyword used to be misspelled `cashed`, so the caching flag was
    # never passed under its real name.
    index_dom = DOM(url.download(cached=True))

    movie_urls = []
    for cell in index_dom.by_class("titleColumn"):
        # The first link inside the title cell points at the movie page.
        href = cell.by_tag("a")[0].attrs['href']
        # Keep only the part before '?' and prefix the site root.
        movie_urls.append("http://www.imdb.com" + href.split('?')[0])

    return movie_urls
def setUp(self):
    """Parse the backup HTML page and store header plus extracted rows.

    ``self.rows`` starts with the column header and is followed by whatever
    ``extract_tvseries`` produces for the parsed page.
    """
    header = ['Title', 'Ranking', 'Genre', 'Actors', 'Runtime']

    with open(BACKUP_HTML, 'r') as backup:
        page = DOM(backup.read())

    # extract_tvseries does not emit a header itself, so it is added here.
    self.rows = [header]
    self.rows += extract_tvseries(page)
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: address of the top 250 index page; it is wrapped in
            pattern.web.URL before downloading.

    Returns:
        A list of strings, one per link found inside the ``.titleColumn``
        elements, each resolved against the address of the index page so
        that it is absolute.
    '''
    # pattern.web's URL-joining helper shares its name with the built-in
    # abs(); importing it here guarantees the helper is the one called.
    from pattern.web import abs

    url = URL(url)
    dom = DOM(url.download(cached=True))

    # Without a base the hrefs were passed through unresolved; use the page
    # address (or its redirect target) as the base.
    base = url.redirect or url.string

    movie_urls = []
    for cell in dom('.titleColumn'):
        for link in cell('a'):
            href = link.attributes.get('href')
            # Anchors without an href have nothing to resolve.
            if href:
                movie_urls.append(abs(href, base=base))

    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, one per link found in the ``td.titleColumn``
        cells of ``tbody.lister-list``, each prefixed with the site root.
    '''
    site_root = "http://imdb.com"
    dom = DOM(url.download(cached=True))

    movie_urls = []
    for body in dom.by_tag("tbody.lister-list"):
        for cell in body.by_tag("td.titleColumn"):
            for anchor in cell.by_tag("a"):
                # Read the href attribute directly instead of splitting the
                # element's string form on '"', which only worked when href
                # happened to be the first quoted attribute.
                href = anchor.attrs.get("href")
                if href:
                    movie_urls.append(site_root + str(href))

    return movie_urls
def extract_incidents(dom):
    """Build one record per incident row of the overview table.

    The first ``tr`` of ``dom`` is skipped.  For every other row the linked
    detail page is downloaded, and values are cut out of fixed paragraph and
    cell positions using fixed character offsets.

    Args:
        dom: parsed overview page offering ``by_tag``.

    Returns:
        A list of 8-item lists ``[link, location, latitude, longitude, date,
        violation, weapons, description]``, every item UTF-8 encoded.
    """
    incident_list = []

    # Slicing off the first row replaces the manual row counter.
    for row in dom.by_tag('tr')[1:]:
        link = INCIDENT_URL + row.by_tag('a')[0].href
        print(link)

        detail = DOM(URL(link).download(timeout=100))
        paragraphs = detail.by_tag('p')

        # Values on the detail page sit at fixed paragraph indexes, after a
        # fixed-length prefix.
        weapon_parts = paragraphs[16].content[27:].split('<br />')
        weapons = ", ".join(part.strip() for part in weapon_parts)[:-2]
        latitude = paragraphs[2].content[33:].strip()
        longitude = paragraphs[3].content[34:].strip()

        # The remaining values come from the overview row itself.
        description = row.by_tag('div')[0].content[1:].strip()
        cells = row.by_tag('td')
        date = cells[2].content[1:].strip()
        location = cells[3].content[1:].strip()
        violation = cells[4].content[1:].strip()

        fields = (link, location, latitude, longitude,
                  date, violation, weapons, description)
        incident_list.append([field.encode('utf-8') for field in fields])

    return incident_list
def load_dom(url):
    """Fetch ``url`` with requests and parse the response body into a DOM.

    Returns:
        A DOM built from the response content when the status code is 200,
        otherwise None.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return DOM(response.content)
def scrape_top_250(url):
    '''
    Collect the movie page URLs listed on the IMDB top 250 index page.

    Args:
        url: address of the top 250 index page; it is wrapped in
            pattern.web.URL before downloading.

    Returns:
        A list with one entry per link inside the ``td.titleColumn`` cells:
        the site root followed by the UTF-8 encoded href.
    '''
    link_base = "http://www.imdb.com"
    page = DOM(URL(url).download(cached=True))

    movie_urls = []
    for cell in page.by_tag("td.titleColumn"):
        # Every anchor of the cell contributes one URL.
        movie_urls.extend(
            link_base + anchor.attrs["href"].encode("utf-8")
            for anchor in cell.by_tag("a"))

    return movie_urls
def get_patent_urls(keyword, limit=10):
    """Search lens.org for ``keyword`` and return the result links.

    Args:
        keyword: search phrase; it is quoted with ``urllib.quote_plus``.
        limit: value sent as the ``n`` query parameter.

    Returns:
        A list of strings: the site root followed by the href of every
        ``a.link`` element on the search page.
    """
    base_url = "http://www.lens.org"
    search_path = ("/lens/search?ft=true&l=en&st=true&n=" + str(limit)
                   + "&q=" + urllib.quote_plus(keyword))

    page = DOM(URL(base_url + search_path).download())

    links = []
    for anchor in page('a.link'):
        links.append(base_url + anchor.attributes.get('href'))
    return links
def all_lyrics(artist):
    """Download every song of an artist and return titles with lyrics as JSON.

    The artist index page lives at ``BASE_URL + <first character> + "/" +
    <name without whitespace and apostrophes> + ".html"``.  Every link in
    ``div#listAlbum`` whose resolved address does not contain "amazon" is
    followed, and the plain text of the fifth ``div#main div`` element of
    that page is taken as the lyrics.

    Args:
        artist: artist name; must not be empty because its first character
            is used in the address.

    Returns:
        A JSON string encoding a list of ``[title, lyrics]`` pairs.
    """
    clean = re.sub(r"\s+|'", '', artist)
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())
    base = url.redirect or url.string

    songs = []
    for anchor in dom('div#listAlbum a'):
        link = abs(anchor.attributes.get('href', ''), base=base)
        if 'amazon' in link:
            continue
        song_dom = DOM(URL(link).download())
        lyric = plaintext(song_dom('div#main div')[4].content)
        # Title and lyrics are taken from the same anchor.  Previously the
        # title list included the amazon links while the lyrics list did
        # not, so pairs drifted apart after the first filtered link.
        songs.append((anchor.content, lyric))

    # A real list (not a zip object) serialises on Python 2 and 3 alike.
    return json.dumps(songs, sort_keys=True)
def get_dom_object(self, url_target):
    """Download ``url_target`` and return it parsed as a DOM.

    Args:
        url_target: address of the page to download (cached download).

    Returns:
        The parsed DOM, or None when downloading or parsing failed.  On
        failure a message is printed and ``self.url_query_timeout`` is set
        to 1.
    """
    # Bound up front so the failure path returns None instead of raising
    # UnboundLocalError at the return statement.
    dom_object = None
    try:
        dom_object = DOM(URL(url_target).download(cached=True))
    except Exception:
        # NOTE(review): the message reports self.target_url_str, not the
        # url_target argument -- confirm which one is intended.
        print('Problem retrieving data for this url: ', self.target_url_str)
        self.url_query_timeout = 1
    return dom_object
def scrape(url):
    """Write one week of the top40.nl chart to ``allMusicOneWeek.csv``.

    Every ``div.clearfix`` inside an ``ol.top40`` element becomes one line:
    a running position, the content of each ``credit`` element, and the
    content of the 2nd to 5th ``strong`` elements, separated by commas.
    HTML entities in the line are unescaped before it is written.

    NOTE(review): the ``url`` argument is overwritten on the first line of
    the body; the page downloaded is always the hard-coded 2015 week-46
    chart.
    """
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        # Last path segment, i.e. "week-46"; not used further below.
        week = url.split("/")
        week = week[-1]
        url = URL("http://www.top40.nl/top40/2015/week-46")
        dom = DOM(url.download(cached=True))
        # gives the week
        i = 1
        # select the top-40 list
        for l in dom.by_tag("ol.top40"):
            # select per song
            print "lijst top 40"
            for e in l.by_tag("div.clearfix"):
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                print i, 'positie'
                i += 1  # careful with resetting
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # position
                for inner in e.by_tag("strong")[1:2]:
                    print inner.content, "1:2"
                    muziekGegevens += inner.content + ","
                # highest ranking
                for inner in e.by_tag("strong")[2:3]:
                    print inner.content, "2:3"
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    print inner.content, "3:4"
                    muziekGegevens += inner.content + ","
                # year of the song
                for inner in e.by_tag("strong")[4:5]:
                    print inner.content.strip(), "4:5"
                    muziekGegevens += inner.content.strip()
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)
                # presumably whatisthis() reports whether the value is
                # already unicode -- confirm against its definition.
                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                    print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                    f.write(muziekGegevens + "\n")
                else:
                    f.write(muziekGegevens + "\n")
    # Debug output recorded from an earlier run:
    # 1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    # File "testhtmlscraper.py", line 58, in <module>
    # f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
    # NOTE(review): `f.close` is only referenced, not called; the `with`
    # block above is what closes the file.
    f.close
def box_office_titles():
    """Return the movie titles found on the box-office page.

    The page at ``BOX_OFFICE_URL`` is downloaded and every element matching
    ``MOVIE_TITLE_TAG`` is selected.

    Returns:
        A list with the ``content`` of each selected element, in page order.
    """
    # Download the web page and select the title elements.
    html = URL(BOX_OFFICE_URL).download()
    title_elements = DOM(html)(MOVIE_TITLE_TAG)

    # A list comprehension returns a real list on Python 3 as well, where
    # map() would hand back a one-shot iterator.
    return [element.content for element in title_elements]
def extract_pic_url(self):
    """Collect raw image URLs from the result links in ``self.page_source``.

    At most ``self.image_dl_per_search`` ``a.rg_l`` links are inspected.
    The text between ``imgurl=`` and ``&imgrefurl`` in each link's href is
    appended to ``self.pic_url_list``; links without that pattern are
    reported with a printed message and skipped.
    """
    dom = DOM(self.page_source)

    for tag in dom('a.rg_l')[:self.image_dl_per_search]:
        # A missing href is handled like an href that does not match.
        match = re.search('imgurl=(.*)&imgrefurl',
                          tag.attributes.get('href', ''))
        # Test the match explicitly instead of relying on a bare `except`
        # to catch the AttributeError raised by None.group().
        if match is None:
            print('error parsing', tag)
            continue
        self.pic_url_list.append(match.group(1))
def main():
    '''
    Crawl the IMDB top 250 movies and save a CSV file with their data.

    Copies of the index page and of one movie page are written to
    BACKUP_DIR along the way (those copies are used in grading).
    '''
    # Directory that receives the HTML copies.
    print('Setting up backup dir if needed ...')
    create_dir(BACKUP_DIR)

    # Fetch the index page and keep a copy of it.
    print('Access top 250 page, making backup ...')
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)  # parsed here, not used further below
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # Collect the address of every movie page.
    print('Scraping top 250 page ...')
    url_strings = scrape_top_250(top_250_url)

    # Visit each movie page and gather one row per movie.
    rows = []
    for i, url in enumerate(url_strings):
        print('Scraping movie %d ...' % i)
        movie_html = URL(url).download(cached=True)
        rows.append(scrape_movie_page(DOM(movie_html)))

        # A single movie page is kept as a sample for testing.
        if i == 83:
            backup_name = 'movie-%03d.html' % i
            make_backup(os.path.join(BACKUP_DIR, backup_name), movie_html)

    # Write all rows to the CSV file next to the script.
    print('Saving CSV ...')
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
def downloadContent(self):
    """Download the page behind ``self.url``, decode it and parse it.

    Sets ``self.content`` to the downloaded data, calls
    ``self.decodeContent()`` and stores the parsed result in ``self.dom``.

    Raises:
        URLError: when ``self.isWebPage()`` is false, or when the download
            raises ``httplib.InvalidURL``.
    """
    if not self.isWebPage():
        raise URLError("Invalid or empty content type")

    try:
        downloaded = self.url.download(timeout=1)
    except httplib.InvalidURL:
        raise URLError("Invalid URL")

    self.content = downloaded
    self.decodeContent()
    self.dom = DOM(self.content)
def inflect(word, language="italian"):
    """Scrape gender and number inflections of ``word`` from en.wiktionary.org.

    Starting at the parent of the element whose id equals ``language``, the
    following siblings are scanned until an ``hr`` element is reached.  Each
    ``h3`` sets the current part-of-speech prefix; each element whose plain
    text starts with ``word`` is matched against a fixed set of patterns.

    Args:
        word: the word to look up; spaces become underscores in the address.
        language: id of the language header to start from.

    Returns:
        A dict mapping ``<pos prefix> + <gender tag>`` (tags "m", "f", "mp",
        "fp", "p") to the inflected form, e.g.
        {"adj-m": "affetto", "adj-fp": "affette"}.  Empty when the page has
        no element with the requested id.
    """
    inflections = {}
    url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_")
    dom = DOM(URL(url).download(throttle=10, cached=True))

    # Search the header that marks the start for the given language:
    # <h2><span class="mw-headline" id="Italian">Italian</span></h2>
    headers = dom("#" + language)
    if not headers:
        # No section for this language: nothing to report.
        return inflections

    # Escape the word so characters such as "." or "(" in it are matched
    # literally instead of being read as regex syntax.
    w = re.escape(word)
    patterns = (
        ("m" , r"(" + w + r") m", 1),
        ("f" , r"(" + w + r") f", 1),
        ("m" , r"(" + w + r") (mf|m and f)", 1),
        ("f" , r"(" + w + r") (mf|m and f)", 1),
        ("m" , r"masculine:? (\S*?)(,|\))", 1),
        ("f" , r"feminine:? (\S*?)(,|\))", 1),
        ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3),
        ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3),
        ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3),
        ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3),
        ( "p", r"(\(|, )plural (\S*?)(,|\))", 2),
        ( "p", r"m and f plural (\S*?)(,|\))", 1),
    )

    pos = ""
    e = headers[0].parent
    while e is not None:
        if e.type == "element":
            if e.tag == "hr":
                # Horizontal line = next language.
                break
            if e.tag == "h3":
                # <h3>Adjective [edit]</h3>
                pos = plaintext(e.content.lower())
                pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-"
            # Parse inflections, using regular expressions.
            # affetto m (f affetta, m plural affetti, f plural affette)
            s = plaintext(e.content)
            if s.startswith(word):
                for gender, regexp, i in patterns:
                    m = re.search(regexp, s, re.I)
                    if m is not None:
                        inflections[pos + gender] = m.group(i)
        e = e.next_sibling
    return inflections
def extract_percentages(dom):
    """Return the first percentage shown on the page at ``TARGET_URL``.

    Args:
        dom: NOTE(review): not read; the page is downloaded again from
            ``TARGET_URL`` instead.

    Returns:
        The content of the first element with class ``percentage``, UTF-8
        encoded, or the string "nodata" when no such element exists.
    """
    file_dom = DOM(URL(TARGET_URL).download())

    # Query once; only the first match is ever returned, so the remaining
    # matches no longer get encoded and collected first.
    matches = file_dom.by_class('percentage')
    if not matches:
        return "nodata"
    return matches[0].content.encode('utf-8')
def extract_data_ML(i):
    """Read description and mp3 path from a macaulaylibrary.org audio page.

    Args:
        i: value inserted into the audio page address.

    Returns:
        A dict with the keys 'index' (the given ``i``), 'desc' (the
        ``content`` attribute of the first ``meta`` element) and 'mp3' (the
        first double-quoted value on the first line starting with "file"
        inside the first script that contains "jwplayer(").
    """
    dom = DOM(URL('http://macaulaylibrary.org/audio/%s' % i).download())
    description = dom('meta')[0].attr['content']

    # Scripts that set up the player.
    player_scripts = []
    for script in dom('script'):
        if 'jwplayer(' in script.content:
            player_scripts.append(script.content)

    # Lines of the first such script that begin with "file".
    file_lines = []
    for line in player_scripts[0].split('\n'):
        stripped = line.strip()
        if stripped.startswith('file'):
            file_lines.append(stripped)

    path_to_mp3 = file_lines[0].split('"')[1]
    return {'index': i, 'desc': description, 'mp3': path_to_mp3}
def create_dom_object(self):
    """ Create the DOM object used for scraping from ``self.full_url_str``.

        On success the parsed page is stored in ``self.dom_object``.  When
        downloading or parsing fails, ``self.url_query_timeout`` is set to 1
        and, if the print flag is enabled, a message naming the url is
        printed.
    """
    try:
        url = URL(self.full_url_str)
        self.dom_object = DOM(url.download(cached=True))
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C and interpreter
        # shutdown are no longer swallowed as a "query problem".
        if self.__print_url_finding_error:
            # Two spaces after the colon reproduce the output of the former
            # two-argument print statement.
            print('Problem retrieving data for this url:  %s'
                  % (self.full_url_str,))
        self.url_query_timeout = 1
def get_dom_object(self, url_target):
    """Fetch ``url_target``, run its JavaScript and return the page as a DOM.

    Args:
        url_target: address of the page to load.

    Returns:
        The DOM built from the rendered HTML, or None on any failure, in
        which case ``self.ErrorReason`` holds a message naming the url.
    """
    session = None
    try:
        session = HTMLSession()
        # get the html content
        response = session.get(url_target)
        # execute JavaScript
        response.html.render(timeout=30, sleep=2)
        return DOM(response.html.html)
    except Exception:
        self.ErrorReason = 'Problem retrieving data for this url: ' + url_target + '.\nPlease check your Internet connection.'
        return None
    finally:
        # Release the session on success and on failure; it was previously
        # left open on every call.
        if session is not None:
            session.close()
def extract_pic_url(self):
    """ extract all the raw pic url in list

        The number of ``a.rg_l`` links found in ``self.page_source`` is
        printed, then at most ``self.nb_images`` of them are inspected.  The
        text between ``imgurl=`` and ``&imgrefurl`` in each href is appended
        to ``self.pic_url_list``; links without that pattern are reported
        with a printed message and skipped.
    """
    dom = DOM(self.page_source)
    tag_list = dom('a.rg_l')
    print(len(tag_list))

    for tag in tag_list[:self.nb_images]:
        # A missing href is handled like an href that does not match.
        match = re.search('imgurl=(.*)&imgrefurl',
                          tag.attributes.get('href', ''))
        # Test the match explicitly instead of relying on a bare `except`
        # to catch the AttributeError raised by None.group().
        if match is None:
            print('error parsing %s' % (tag,))
            continue
        self.pic_url_list.append(match.group(1))
def conjugate(verb, language="italian"):
    """Scrape the conjugation of ``verb`` from its en.wiktionary.org page.

    Args:
        verb: infinitive; inserted into the page address.
        language: compared with the id of the ``span`` inside ``h2`` headers.

    Returns:
        A dict built from the first ``table.inflection-table`` on the page:
        "infinitive" maps to ``verb``, ``(tense, mood)`` tuples map to the
        list of cell texts of that row, and participle names map to a single
        text.  An empty dict when the page has no such table.
    """
    url = URL("http://en.wiktionary.org/wiki/%s" % verb)
    dom = DOM(url.download(throttle=10, cached=True))
    conj = {"infinitive": verb}
    mood = None
    for table in dom("table.inflection-table"):
        # Search the header that marks the start for the given language:
        # <h2><span class="mw-headline" id="Italian">Italian</span></h2>
        # NOTE(review): as laid out here the loop only walks `h2` backwards
        # until it is falsy; the `continue` is the last statement of the
        # loop body, so no table is skipped by it -- confirm the intent.
        h2 = table.parent.parent
        while h2:
            h2 = h2.previous
            if getattr(h2, "tag", "") == "h2" and \
               getattr(h2("span")[0], "id", "") != language:
                continue
        for tr in table("tr"):
            for th in tr("th"):
                # <th>indicative</th>
                if th.content in MOOD:
                    mood = th.content
                # <th>present</th><td>sono</td><td>sei></td>...
                if th.content in TENSE:
                    conj[th.content, mood] = [plain(td.content) for td in tr("td")]
                # <th>gerund</th><td>essendo</td>
                if th.content in PARTICIPLE:
                    conj[th.content] = plain(th.next.next.content)
            # <th>imperative</th></tr><tr><td></td><td>sii</td>...
            # A row without any header cell while the mood is "imperative"
            # is stored as the present imperative.
            if mood == "imperative" and len(tr("th")) == 0:
                conj["present", mood] = [plain(td.content) for td in tr("td")]
        # Returning inside the loop means only the first table is processed.
        return conj
    return {}
def extract_images_url(self):
    """Load ``self.url_search`` in Chrome and collect the image URLs it shows.

    The page is scrolled 30 times, clicking an ``input`` button after each
    scroll when one can be clicked.  The resulting page source is stored in
    ``self.driver_source``.  ``self.num_images`` is lowered to the number of
    ``a.rg_l`` links found if it exceeds it, and the text between
    ``imgurl=`` and ``&imgrefurl`` of each link's href is appended to
    ``self.images_url_list``.  Progress is printed along the way.
    """
    # Initialize Chrome Webdriver using Selenium.
    driver = webdriver.Chrome("/usr/local/bin/chromedriver")
    try:
        driver.get(self.url_search)

        # Scroll around the page.
        init_position = 0
        move_to = 200000
        for _ in range(30):
            window_scroll = ("window.scrollBy(" + str(init_position) + ","
                             + str(move_to) + ")")
            driver.execute_script(window_scroll)
            time.sleep(0.2)
            init_position = move_to
            move_to = move_to + 100000
            # Click the "Show more results" button when it can be found.
            try:
                driver.find_element_by_xpath("//input[@type='button']").click()
                print("Click!")
            except Exception:
                # No clickable button after this scroll: keep scrolling.
                continue

        time.sleep(0.5)
        self.driver_source = driver.page_source

        # Retrieve the different image URLs from the page.
        dom = DOM(self.driver_source)
        tag_list = dom('a.rg_l')
        print("Total images retrieved: " + str(len(tag_list)))

        # Avoid asking for more images than the page provides.
        if self.num_images > len(tag_list):
            self.num_images = len(tag_list)

        # Only a maximum of self.num_images links is inspected.
        for tag in tag_list[:self.num_images]:
            # A missing href is handled like an href that does not match.
            match = re.search('imgurl=(.*)&imgrefurl',
                              tag.attributes.get('href', ''))
            if match is None:
                print('error parsing', tag)
                continue
            self.images_url_list.append(match.group(1))

        # Number of URLs collected so far.
        print("\nTotal number of URL images: "
              + str(len(self.images_url_list)))
    finally:
        # Close the Chrome Webdriver even when a step above raised, so no
        # browser process is left behind.
        driver.quit()