def extract_incidents(dom):
    # Walk every <tr> of the incident index page; the first row is the table
    # header and is skipped via the `i > 0` guard.
    incident_list = []
    i = 0
    for incident in dom.by_tag('tr'):
        if i > 0:
            # Absolute URL of the incident's detail page (INCIDENT_URL is a
            # module-level constant defined elsewhere in this file).
            link = INCIDENT_URL + incident.by_tag('a')[0].href
            print link
            url = URL(link)
            html = url.download(timeout=100)
            dom_incident = DOM(html)
            # Weapon names live in the 17th <p> of the detail page; the [27:]
            # slice drops a fixed-length label prefix -- brittle, tied to the
            # page's exact markup.
            weapons = [weapon.strip() for weapon in dom_incident.by_tag('p')[16].content[27:].split('<br />')]
            # NOTE(review): [:-2] chops the last two characters of the JOINED
            # string (not a trailing ", " separator) -- looks suspicious;
            # confirm against actual page content.
            weapons = ", ".join(weapons)[:-2]
            # Coordinates: fixed-offset slices past the labels in <p> 3 and 4.
            latitude = dom_incident.by_tag('p')[2].content[33:].strip()
            longitude = dom_incident.by_tag('p')[3].content[34:].strip()
            # Summary fields come from the index row itself; [1:] drops a
            # leading whitespace/newline character.
            description = incident.by_tag('div')[0].content[1:].strip()
            date = incident.by_tag('td')[2].content[1:].strip()
            location = incident.by_tag('td')[3].content[1:].strip()
            violation = incident.by_tag('td')[4].content[1:].strip()
            # One CSV-ready row per incident, everything UTF-8 encoded.
            incident_list.append([link.encode('utf-8'), location.encode('utf-8'), latitude.encode('utf-8'), longitude.encode('utf-8'), date.encode('utf-8'), violation.encode('utf-8'), weapons.encode('utf-8'), description.encode('utf-8')])
        i += 1
    return incident_list
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)

    Returns a list of [title, rank, genres, actors, runtime] lists, one per
    series, for the first NUMBER_OF_SERIES entries on the page.
    '''
    # BUG FIX: the original re-downloaded TARGET_URL and rebound `dom`,
    # silently ignoring the DOM instance the caller passed in.  Use the
    # argument as documented.
    series_list = []

    for l in range(NUMBER_OF_SERIES):
        # Rank is the span.value inside the series' table row.
        rank = dom.by_tag("tr.detailed")[l].by_tag("span.value")[0].content

        # Runtime: strip the trailing " mins" suffix, keeping the number only.
        time = plaintext(dom.by_tag("span.runtime")[l].content)[:-5]

        # Comma-separate all genres; drop non-ASCII characters as allowed.
        genre = ", ".join(m.content for m in dom.by_tag("span.genre")[l].by_tag("a"))
        genre = genre.encode('ascii', 'ignore').decode('ascii')

        # Comma-separate all credited actors; drop non-ASCII characters.
        credit = ", ".join(m.content for m in dom.by_tag("span.credit")[l].by_tag("a"))
        credit = credit.encode('ascii', 'ignore').decode('ascii')

        # The second anchor in the row is the title link.
        title = dom.by_tag("tr.detailed")[l].by_tag("a")[1].content

        series_list.append([title, rank, genre, credit, time])

    return series_list
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    base = 'http://www.imdb.com'
    dom = DOM(URL(url).download())

    movie_urls = []
    # Each movie row keeps its link inside a td.titleColumn cell; record the
    # last anchor's href per cell (empty string when the cell has no anchor).
    for cell in dom.by_tag("td.titleColumn"):
        absolute = ''
        for anchor in cell.by_tag("a"):
            absolute = base + anchor.attributes.get("href", "")
        movie_urls.append(absolute)

    return movie_urls
def scrape_starrtest(county_num):
    # Zero-pad single-digit county numbers so the lstCounty query parameter
    # is always two characters wide (e.g. 1 -> '01').
    if county_num<10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    #url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    abs_url = URL(string = url)
    dom = DOM(abs_url.download(cached=True))#download the DOM
    # Per-subject student counts come from the "rm" cells; judging by the
    # variable names the index order is ELA, history, math, science (CST).
    # Index 4 (end-of-course science) is deliberately commented out.
    #sciend_num = dom.by_class("rm")[4].content
    scicst_num = dom.by_class("rm")[3].content
    math_num = dom.by_class("rm")[2].content
    hist_num = dom.by_class("rm")[1].content
    ela_num = dom.by_class("rm")[0].content
    # Per-subject percentages come from the "rs" cells; [:5] keeps only the
    # leading digits/decimal of the percentage text.
    #sciend_percent = dom.by_class("rs")[4].content[:5]
    scicst_percent = dom.by_class("rs")[3].content[:5]
    math_percent = dom.by_class("rs")[2].content[:5]
    hist_percent = dom.by_class("rs")[1].content[:5]
    ela_percent = dom.by_class("rs")[0].content[:5]
    # The page's <h2> heading carries the county name.
    county = dom.by_tag("h2")[0].content
    # write all the collected data to a new row of the output file
    # NOTE(review): `writer` is a module-level csv writer defined elsewhere
    # in this file -- this function has a hidden dependency on it.
    writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # Download the index page (using the cache) and build its DOM.
    dom = DOM(URL(url).download(cached=True))

    site_root = "http://www.imdb.com"
    # Every movie link sits inside a td.titleColumn cell; prefix the site
    # root to each anchor's relative href to form the absolute URL.
    for cell in dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            movie_urls.append(site_root + anchor.attrs["href"].encode("utf-8"))

    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # Fetch the page HTML and parse it into a DOM.
    dom = DOM(url.download())

    # Every absolute movie URL starts with this root.
    root = 'http://www.imdb.com'

    # Each film's link lives in a td cell with class "titleColumn".
    for cell in dom.by_class("titleColumn"):
        # Re-parse the cell's inner HTML and take the first anchor's href
        # (the film's relative path), then join it with the root.
        first_anchor = DOM(cell.content).by_tag("a")[0]
        movie_urls.append(root + first_anchor.attrs.get("href", ""))

    return movie_urls
def getReviews(self):
    # Fetch and persist reviews for every restaurant in the Mongo collection
    # that does not yet have a 'reviews' field.
    # Query-string template for the comments XML feed; 'oid' is filled in per
    # restaurant below.
    params = {
        'id' : "comments",
        'oid' : 0,
        'showAll' : 'yes'
    }
    reviews = []
    i=0
    for rs in self.conn.resturants.find():
        reviews = []
        # Skip restaurants whose reviews were already scraped.
        if not rs.get('reviews'):
            # The restaurant's oid is the value after '=' in its stored URL.
            oid = str(rs['url']).split('=')[1]
            params['oid'] = oid
            req = DOM(URL(self.xmlUrl, query=params).download())
            # Each <item> in the feed may carry a <description> with the
            # review text; parse it via self.parseReview.
            for item in req.by_tag('item'):
                if item.by_tag('description'):
                    content = plaintext(item.by_tag('description')[0].content)
                    reviews.append(self.parseReview(content))
            # print reviews[0:3]
            # Persist the parsed reviews back onto the document.
            rs['reviews'] = reviews
            self.conn.resturants.save(rs)
            print 'saved reviews for', rs['name']
        else:
            print 'already have reviews for', rs['name']
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # FIX: use the `url` argument instead of the module constant TOP_250_URL,
    # read hrefs from the anchor's attributes rather than splitting str(a) on
    # double quotes (brittle), and drop the debugging print of the list.
    dom = DOM(URL(url).download())
    for td in dom.by_tag("td.titleColumn")[:250]:   # one cell per movie
        for a in td.by_tag("a"):
            movie_urls.append("http://www.imdb.com" + a.attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_beer_info_urls(url): ''' Scrape the top 30 beer discounts from Yenom.com ''' # Download the HTML file html = url.download() # Parse the HTML file into a DOM representation dom = DOM(html) table = dom.by_tag("table.hikashop_products_table adminlist table table-striped table-hover")[0] i = 0 info_urls = [] # Loop through all beer discounts for listItem in table.by_tag("tr")[1:]: print print i i += 1 print # Get URL links = listItem.by_tag("a") # Some of the rows in the table are separators between supermarkets so they do not have a link if len(links) > 0: #print Links[0].content.encode("utf-8") print HOME_URL + links[0].attrs["href"] info_urls.append(HOME_URL + links[0].attrs["href"]) # return the list of URLs for each info page return info_urls """
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # Parse the downloaded index page into a DOM.
    page_dom = DOM(URL(url).download())
    site_home = 'http://www.imdb.com'

    movie_urls = []
    # One titleColumn cell per ranked movie; keep the href of the last
    # anchor in the cell (or '' if the cell holds no anchor at all).
    for title_cell in page_dom.by_tag("td.titleColumn"):
        resolved = ''
        for anchor in title_cell.by_tag("a"):
            resolved = site_home + anchor.attributes.get("href", "")
        movie_urls.append(resolved)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # FIX: read each href from the anchor's attributes instead of splitting
    # str(anchor) on double quotes (brittle, depends on serialisation), and
    # stop shadowing the `url` parameter with the innermost loop variable.
    movie_urls = []
    dom = DOM(url.download(cached=True))
    base = "http://imdb.com"

    for lister in dom.by_tag("tbody.lister-list"):
        for cell in lister.by_tag("td.titleColumn"):
            for anchor in cell.by_tag("a"):
                movie_urls.append(base + anchor.attrs.get("href", ""))

    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs

    movie_urls = []
    dom = DOM(url.download(cached=True))

    # Resolve every movie anchor found under the ranking table body into an
    # absolute URL, using the page's own (possibly redirected) address as
    # the base.
    base = url.redirect or url.string
    for body in dom.by_tag("tbody.lister-list"):
        for column in body.by_tag("td.titleColumn"):
            for anchor in column.by_tag("a"):
                movie_urls.append(abs(anchor.attrs.get("href", ""), base=base))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def obtain_data(url): ''' Scrape the Wikipedia page. Args: url: pattern.web.URL instance pointing to the Wikipedia page Returns: A list of lists, where each sublist represents a data point. Each sublist contains two elements: a string with the name of the country, and a string with the size of the population of that country. ''' # Create a DOM of the URL. html = url.download(cached=True) dom = DOM(html) data_points = [] for countries_table in dom.by_tag("table.wikitable sortable"): for table_row in countries_table.by_tag("tr")[1:]: # The first row is the header, so start at index 1. table_row_content = [] # Obtain the content of the row. for table_row_cell in table_row.by_tag("td"): table_row_cell_content = unicode(plaintext(table_row_cell.content)) table_row_content.append(table_row_cell_content) # Obtain the country name and the population size. country = table_row_content[1].split("[")[0].split(" (")[0] population = "".join(table_row_content[2].split(",")) data_point = [country, population] data_points.append(data_point) return data_points
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # FIX: the original sliced by_tag("td.titleColumn")[:1], which scraped
    # only the FIRST movie instead of all 250; iterate over every title cell.
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for cell in top_250_dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            # Relative movie path, UTF-8 encoded, joined onto the site root.
            link_ext = anchor.attrs["href"].encode("utf-8")
            movie_urls.append("http://www.imdb.com" + link_ext)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs

    movie_urls = []
    dom = DOM(url.download())

    # Base address used to absolutise the relative hrefs.  Named `base`
    # rather than rebinding the `url` parameter as the original did.
    base = URL("http://imdb.com")

    for cell in dom.by_tag("td.titleColumn"):
        anchor = cell.by_tag("a")[0]
        href = anchor.attrs.get("href", "")
        # FIX: the original computed the absolute URL but never appended it
        # to movie_urls, so the function always returned an empty list.
        movie_urls.append(abs(href, base=base.redirect or base.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # Grab web page (cached).
    movie_html = URL(url).download(cached=True)
    movie_dom = DOM(movie_html)

    # FIX: read the href from the anchor's attributes instead of splitting
    # str(anchor) on double quotes, drop the unused `title` variable, and do
    # not shadow the `url` parameter with the assembled tuple.
    for cell in movie_dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            movie_urls.append("http://www.imdb.com" + anchor.attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # FIX: the original ignored the `url` argument entirely and re-built the
    # request from a hard-coded TOP_250_URL; honour the argument (documented
    # as a pattern.web.URL instance) instead.
    top_250_html = url.download(cached=True)
    dom = DOM(top_250_html)

    movie_urls = []

    # Take the first anchor inside every title cell and prefix the site root
    # to its relative href to form an absolute URL.
    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            movie_urls.append("http://www.imdb.com" + a.attrs["href"])

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    index_html = URL(url).download(cached=True)
    index_dom = DOM(index_html)

    # FIX: don't rebind the `url` parameter inside the loop (the original
    # did `url = j.attributes["href"]`); use a distinct local name for each
    # movie's relative path.
    for cell in index_dom.by_tag("td.titleColumn")[:250]:
        for anchor in cell.by_tag("a")[:1]:
            href = anchor.attributes["href"]
            movie_urls.append("http://www.imdb.com" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # FIX: honour the `url` argument instead of the TOP_250_URL constant,
    # iterate over the title cells actually present rather than assuming
    # exactly 250 (an IndexError if fewer), and read hrefs from attributes
    # instead of splitting str(element) on double quotes.
    dom = DOM(url.download())
    for cell in dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            movie_urls.append('http://www.imdb.com' + anchor.attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def research_on(self, what, where):
    # Search the French white pages (pagesjaunes.fr) for `what` near `where`
    # and extract values from every result page into self.myInfo.
    url = URL(
        "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
        what + "&ou=" + where + "&proximite=0")
    dom = DOM(url.download(cached=True))
    # The result count is shown in span.denombrement under the main title.
    for a in dom.by_tag("div.main-title pj-on-autoload "):
        for e in a.by_tag("span.denombrement"):
            # NOTE(review): [:3] assumes the count is at most 3 digits --
            # confirm for large result sets.
            number_of_results = int(
                self.decode_if_unicode(plaintext(e.content))[:3])
            # 20 results per page; Python 2 integer division floors, and the
            # modulo check adds one page for any remainder.
            number_of_page_results = number_of_results / 20
            if (number_of_results % 20 > 0):
                number_of_page_results += 1
    # Page 1 is the DOM we already downloaded.
    self.exctract_values(dom, self.myInfo)
    # Fetch and process the remaining result pages (2..N).
    for i in range(2, number_of_page_results + 1):
        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0+" "&page=" + str(i))
        dom = DOM(url.download(cached=True))
        self.exctract_values(dom, self.myInfo)
    self.myInfo.sort_and_merge()
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    dom = DOM(url.download())

    # The ranking table body holds one td.titleColumn cell per movie; its
    # first anchor's href is the movie page's relative path.
    lister = dom.by_tag('tbody.lister-list')[0]
    movie_urls = ['http://www.imdb.com' + cell('a')[0].attrs['href']
                  for cell in lister.by_tag('td.titleColumn')]

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs

    movie_urls = []

    # FIX: the original discarded the `url` argument and re-fetched a
    # hard-coded chart URL; use the URL instance the caller supplied.
    dom = DOM(url.download(cached=True))
    for cell in dom.by_tag("td.titleColumn")[:250]:
        for anchor in cell.by_tag("a"):
            href = anchor.attrs.get("href", "")
            # Absolutise against the page's own (possibly redirected) address.
            movie_urls.append(abs(href, base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # FIX: the original called url.download(cashed=True) -- a misspelled
    # keyword that raises TypeError; the parameter is `cached`.
    html = url.download(cached=True)
    dom = DOM(html)

    for movie_table in dom.by_tag("table.chart full-width"):
        # The first row is the header, so start from index 1.
        for movie_table_row in movie_table.by_tag("tr")[1:251]:
            for movie_table_row_cell in movie_table_row.by_tag("td.titleColumn"):
                for a in movie_table_row_cell.by_tag("a"):
                    # Obtain the relative path from the anchor and join it
                    # with the site root to form an absolute URL.
                    movie_url_path = a.attrs["href"]
                    absolute_movie_url = "".join(["http://www.imdb.com/", movie_url_path])
                    movie_urls.append(absolute_movie_url)

    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # Download the index page (cached) and parse it into a DOM structure.
    index_dom = DOM(URL(url).download(cached=True))

    # One td.titleColumn cell per movie; only its first anchor matters.
    for cell in index_dom.by_tag("td.titleColumn")[:250]:
        for anchor in cell.by_tag("a")[:1]:
            # The href is the second quote-delimited token of the anchor's
            # HTML serialisation.
            relative_path = str(anchor).split('"')[1]
            movie_urls.append("http://imdb.com" + relative_path)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # Fetch the top-250 index page (cached) and build its DOM.
    dom = DOM(URL(url).download(cached=True))

    movie_urls = []
    # Every movie's link sits in a td.titleColumn cell; prepend the site
    # root to the first anchor's relative href to get the absolute URL.
    for cell in dom.by_tag("td.titleColumn"):
        first_link = cell.by_tag('a')[0]
        movie_urls.append("http://www.imdb.com" + first_link.attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """
    # FIX: the original discarded the `url` argument and re-downloaded a
    # hard-coded TOP_250_URL; use the caller-supplied URL instance instead.
    dom = DOM(url.download(cached=True))

    movie_urls = []

    # The first anchor of every title cell carries the movie's relative
    # path; join it with the site root to make an absolute URL.
    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            movie_urls.append("http://www.imdb.com" + a.attrs["href"])

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def make_json(url): json_dict = {} # Geef de data een titel json_dict["data"] = "percentage renewable energy" # Pak de DOM van de tabel van alle landen html = url.download() dom = DOM(DOM(html).by_class("wikitable")[1].content) # Maak een list met info over de landen countrylist = dom.by_tag("tr")[1:] # Lege list om de data aan te appenden pointslist = [] for countryinfo in countrylist: # Lege list om land en percentage renewable energy aan te appenden infopair = [] # Neem de naam van het land en append dat aan infopair infopair.append(DOM(countryinfo.content).by_tag("a")[0].attrs.get("title", "").encode("utf-8")) # Neem het percentage renewable energy van het land en append dat aan infopair infopair.append(DOM(countryinfo.content).by_tag("td")[8].content.encode("utf-8")) # Append de list aan pointslist voor een nested list pointslist.append(infopair) # Geef de dictionary de key 'points' met value de nested list pointslist json_dict["points"] = pointslist # Dump de dictionary als JSON naar de textfile json.txt json.dump(json_dict, open("json.txt", "wb"))
class fbMessageDump(MessageDump):
    # Parses a Facebook HTML message dump into per-contact Thread objects.

    def __init__(self, dump, p1, p2 = None, **kwargs):
        # p2 is accepted for interface symmetry but not forwarded; the base
        # class receives only the dump path and the owner's name p1.
        super(fbMessageDump, self).__init__(dump, p1, **kwargs)

    def construct_dump(self):
        # Replace the file path stored in self.dump with the parsed DOM of
        # the file's contents.
        f = open(self.dump, "r")
        self.dump = DOM(f.read())
        f.close()

    def construct_threads(self):
        # Each div.thread in the dump is a conversation between p1 and one
        # other person; merge threads that share the same counterpart.
        for i in self.dump.by_tag("div.thread"):
            cur_thread = msg_classes.Thread()
            cur_thread.p1 = self.p1
            thread_exists = False
            # The two span.profile fn elements name the participants; pick
            # whichever one is not p1 as the counterpart p2.
            if plaintext(i.by_tag("span.profile fn")[0].content) == self.p1:
                cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[1].content)
            else:
                cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[0].content)
            # TODO if p1 and p2 have the same name, error!
            # assert cur_thread.p1 != cur_thread.p2
            # Each div.message carries sender, timestamp and body.
            for e in i.by_tag("div.message"):
                cur_thread.add_message(
                    plaintext(e.by_tag("div.from")[0].content).encode("utf-8"),
                    e.by_tag("abbr.time published")[0].attributes['title'].encode("utf-8"),
                    plaintext(e.by_tag("div.msgbody")[0].content).encode("utf-8")
                )
            cur_thread.construct_conversations()
            # Merge into an existing thread with the same counterpart, if any.
            for t in self.threads:
                if t.p2 == cur_thread.p2:
                    thread_exists = True
                    t.combine(cur_thread)
            if not thread_exists:
                self.threads.append(cur_thread)
def extract_tvseries(dom):
    '''
    Extract TV-series rows from the DOM of an IMDB listing page.

    Args:
        dom: pattern.web.DOM of the listing page.

    Returns:
        A list of [title, ranking, genres, actors, runtime] rows, where
        genres and actors are lists of ASCII strings and runtime is the raw
        span.runtime text (or "Unknown" when absent).
    '''
    # FIX: the original re-downloaded TARGET_URL and rebound `dom`, ignoring
    # the DOM the caller passed in; it also kept a dead counter `x` and a set
    # of redundant csv_* aliases, and used a bare `except` -- narrowed to
    # IndexError (the only expected failure: missing span.runtime).
    csv_row = []
    for series in dom.by_tag('td.title'):
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = [g.content.encode('ascii', 'ignore')
                  for g in series.by_tag('span.genre')[0].by_tag('a')]
        actors = [a.content.encode('ascii', 'ignore')
                  for a in series.by_tag('span.credit')[0].by_tag('a')]
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except IndexError:
            runtime = "Unknown"
        csv_row.append([title, ranking, genres, actors, runtime])
    return csv_row
def scrape(url):
    # Scrape one week of the Dutch Top 40 chart into allMusicOneWeek.csv.
    # NOTE(review): the `url` parameter is immediately overwritten with a
    # hard-coded week-46 address -- the argument is effectively ignored.
    with io.open("allMusicOneWeek.csv", "w",encoding = "utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        week = url.split("/")
        week = week[-1]
        url = URL("http://www.top40.nl/top40/2015/week-46")
        dom = DOM(url.download(cached = True))
        # gives the week
        i = 1
        # select the top-40 list
        for l in dom.by_tag("ol.top40"):
            # select each song
            print "lijst top 40"
            for e in l.by_tag("div.clearfix"):
                # One comma-separated CSV line per song.
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                print i , 'positie'
                i += 1  # careful with resetting
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # position (strong tags hold chart stats at fixed offsets)
                for inner in e.by_tag("strong")[1:2]:
                    print inner.content , "1:2"
                    muziekGegevens += inner.content + ","
                # highest position reached
                for inner in e.by_tag("strong")[2:3]:
                    print inner.content , "2:3"
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    print inner.content , "3:4"
                    muziekGegevens += inner.content + ","
                # year of the song
                for inner in e.by_tag("strong")[4:5]:
                    print inner.content.strip() , "4:5"
                    muziekGegevens += inner.content.strip()
                # Unescape HTML entities, then make sure the line is unicode
                # before writing (io.open in text mode requires unicode).
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)
                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                    print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                    f.write(muziekGegevens + "\n")
                else:
                    f.write(muziekGegevens + "\n")
    # 1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    # File "testhtmlscraper.py", line 58, in <module>
    # f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
    # NOTE(review): `f.close` below lacks parentheses so it is a no-op; the
    # `with` block already closed the file, so this line is harmless but dead.
    f.close
def scrape(url):
    # Near-duplicate of the scrape() above: writes one week of the Dutch
    # Top 40 chart to allMusicOneWeek.csv.
    # NOTE(review): the `url` parameter is immediately overwritten with a
    # hard-coded week-46 address -- the argument is effectively ignored.
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        week = url.split("/")
        week = week[-1]
        url = URL("http://www.top40.nl/top40/2015/week-46")
        dom = DOM(url.download(cached=True))
        # gives the week
        i = 1
        # select the top-40 list
        for l in dom.by_tag("ol.top40"):
            # select each song
            print "lijst top 40"
            for e in l.by_tag("div.clearfix"):
                # One comma-separated CSV line per song.
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                print i, 'positie'
                i += 1  # careful with resetting
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # position (strong tags hold chart stats at fixed offsets)
                for inner in e.by_tag("strong")[1:2]:
                    print inner.content, "1:2"
                    muziekGegevens += inner.content + ","
                # highest position reached
                for inner in e.by_tag("strong")[2:3]:
                    print inner.content, "2:3"
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    print inner.content, "3:4"
                    muziekGegevens += inner.content + ","
                # year of the song
                for inner in e.by_tag("strong")[4:5]:
                    print inner.content.strip(), "4:5"
                    muziekGegevens += inner.content.strip()
                # Unescape HTML entities, then make sure the line is unicode
                # before writing (io.open in text mode requires unicode).
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)
                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                    print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                    f.write(muziekGegevens + "\n")
                else:
                    f.write(muziekGegevens + "\n")
    # 1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    # File "testhtmlscraper.py", line 58, in <module>
    # f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
    # NOTE(review): `f.close` below lacks parentheses so it is a no-op; the
    # `with` block already closed the file, so this line is harmless but dead.
    f.close
def extract_tvseries(dom):
    # NOTE(review): the `dom` parameter is immediately shadowed by a fresh
    # download of a hard-coded IMDB search URL -- the argument is ignored.
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached = True))

    # infoserie holds one show's fields; infoSerieList collects all rows.
    infoserie = []
    infoSerieList = []
    a = ''

    for e in dom.by_tag("tr.detailed")[:50]:  # Top 50 imdb entries.
        # get title: the anchor's title attribute, with the trailing
        # "(year)" part stripped off.
        for a in e.by_tag("a")[:1]:  # First <a class="title"> in entry.
            infoserie = []
            s = a.attrs["title"]
            infoserie += [s.split('(')[0].strip()]
        # get rating: characters 17-19 of the div's title attribute text
        # (fixed-offset slice into "Users rated this X.Y/10 ...").
        for rating in e.by_tag("div.rating-list")[:1]:
            rating = rating.attrs["title"]
            infoserie +=[rating[17:20]]
        # get genre: one entry per genre anchor.
        for genre in e.by_tag("span.genre")[:1]:
            for m in genre.by_tag("a"):
                infoserie += [m.content]
        # get actors: one entry per credited-actor anchor.
        for actors in e.by_tag("span.credit"):
            for actors_sub in actors.by_tag("a"):
                infoserie +=[actors_sub.content]
        # get time: first 3 characters of the runtime text (the number).
        for time in e.by_tag("span.runtime")[:1]:
            infoserie += [time.content[:3]]
        infotopserie =[]
        # encode to get rid of unicode error
        for encoding in infoserie:
            infotopserie += [encoding.encode('utf-8')]
        # add row to list
        infoSerieList.append(infotopserie)
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''
    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.
    return infoSerieList  # replace this line as well as appropriate
def scrape_movie_page(dom): ''' Scrape the IMDB page for a single movie Args: dom: pattern.web.DOM instance representing the page of 1 single movie. Returns: A list of strings representing the following (in order): title, year, duration, genre(s) (semicolon separated if several), director(s) (semicolon separated if several), writer(s) (semicolon separated if several), actor(s) (semicolon separated if several), rating, number of ratings. ''' # YOUR SCRAPING CODE GOES HERE: dom = DOM(url.download()) #print dom.body.content csv_row = [] for series in dom.by_tag('td.titleColumn'): title = series.by_tag('a')[0] ranking = series.by_tag('td.ratingColumn')[0] genres = series.by_tag('span.genre')[0].by_tag('a') genres = [g.content for g in genres] actors = series.by_tag('span.credit')[0].by_tag('a') actors = [a.content for a in actors] try: runtime = series.by_tag('span.runtime')[0] except: runtime = "Unknown" csv_titles = title csv_ranking = ranking csv_genres = genres csv_actors = actors csv_runtime = runtime row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime] csv_row.append(row) return csv_row print title; ''' Geen idee hoe dit werkt, Python ligt mij, ik heb met moeite vorige week die opdracht afgekregen in mijn ogen is het precies hetzelfde als vorige week, maar het werkt niet. Bij deze dus een incomplete opdracht. Ik hoop niet dat ik daardoor de module niet meer kan halen... Volgende keer beter, JS ligt mij toch iets meer dan Python om eerlijk te zijn. ''' # Return everything of interest for this movie (all strings as specified # in the docstring of this function). return title, duration, genres, directors, writers, actors, rating, \ n_ratings
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)

    Args:
        dom: pattern.web.DOM instance of the IMDB results page. (The
            original ignored this parameter and re-downloaded TARGET_URL.)

    Returns:
        A list of [title, ranking, genre, actors, runtime] rows, one per
        series found on the page. The original stored results in an
        undefined global ``showlist`` (a NameError) as an *unordered set*
        and returned nothing; both defects are fixed here.
    '''
    show_list = []

    for e in dom.by_tag("td.title"):
        # Defaults so a missing tag yields an empty field instead of
        # silently reusing the previous entry's value.
        title = ranking = genre = actors = runtime = ''

        # get title
        for a in e.by_tag("a")[:1]:
            title = plaintext(a.content)

        # get ranking
        for td in e.by_tag("span.value")[:1]:
            ranking = plaintext(td.content)

        # get genre (the rendered span text -- presumably already comma
        # separated on the page; verify against the live markup)
        for span in e.by_tag("span.genre")[:1]:
            genre = plaintext(span.content)

        # get actors/actresses
        for span in e.by_tag("span.credit")[:1]:
            actors = plaintext(span.content)

        # get runtime (number)
        for span in e.by_tag("span.runtime")[:1]:
            runtime = plaintext(span.content)

        # Ordered row instead of a set literal, so fields keep their
        # positions and duplicates are not collapsed.
        show_list.append([title, ranking, genre, actors, runtime])

    return show_list
def scrape_gradrate(county_num): if county_num<10: county_num = '0' + str(county_num) else: county_num = str(county_num) print county_num #url = 'http://dq.cde.ca.gov/dataquest/cohortrates/CRByGender.aspx?cds=01000000000000&TheYear=2011-12&Agg=O&Topic=Dropouts&RC=County&SubGroup=Ethnic/Racial' url = 'http://dq.cde.ca.gov/dataquest/cohortrates/CRByGender.aspx?cds='+county_num+'000000000000&TheYear=2011-12&Agg=O&Topic=Dropouts&RC=County&SubGroup=Ethnic/Racial' abs_url = URL(string = url) dom = DOM(abs_url.download(cached=True))#download the DOM grad_percent = dom.by_tag("tr")[-1].by_tag("td")[4].content total_dropouts = dom.by_tag("tr")[-1].by_tag("td")[5].by_tag("span")[0].content total_grads = dom.by_tag("tr")[-1].by_tag("td")[3].by_tag("span")[0].content total_num = dom.by_tag("tr")[-1].by_tag("td")[2].by_tag("span")[0].content county = dom.by_tag("h2")[0].by_tag("span")[0].content[26:] # write all the collected data to a new row of the output file writer.writerow([county, total_num,total_grads, total_dropouts, grad_percent])
def scrape(url, f):
    """
    Scrape one weekly Top-40 chart page and write every song to ``f`` as a
    comma separated line: position, artist, previous position, highest
    position, points, year.

    Args:
        url: string URL of a single week's Top-40 chart page.
        f:   writable file object that receives one line per song.
    """
    week = url.split("/")
    week = week[-1]  # gives the week number (NOTE(review): never used)
    url = URL(url)
    dom = DOM(url.download(cached=True))
    i = 1
    # select the Top-40 list
    for l in dom.by_tag("ol.top40"):
        # one song at a time
        print "lijst top 40"
        for e in l.by_tag("div.clearfix")[0:40]:
            muziekGegevens = ""
            # position in the top 40
            muziekGegevens += str(i) + ","
            print i, 'positie'
            i += 1  # careful: the counter is not reset between charts
            # select the artist
            for artiest in e.by_class("credit"):
                # error if there are too many elements!
                muziekGegevens += artiest.content + ","
            # position (presumably last week's -- labelled "1:2")
            for inner in e.by_tag("strong")[1:2]:
                print inner.content, "1:2"
                muziekGegevens += inner.content + ","
            # highest position reached
            for inner in e.by_tag("strong")[2:3]:
                print inner.content, "2:3"
                muziekGegevens += inner.content + ","
            # number of points
            for inner in e.by_tag("strong")[3:4]:
                print inner.content, "3:4"
                muziekGegevens += inner.content + ","
            # year of the song (last field: no trailing comma)
            for inner in e.by_tag("strong")[4:5]:
                print inner.content.strip(), "4:5"
                muziekGegevens += inner.content.strip()
            # unescape HTML entities, force unicode if needed, then write
            h = HTMLParser.HTMLParser()
            muziekGegevens = h.unescape(muziekGegevens)
            if not whatisthis(muziekGegevens):
                muziekGegevens = unicode(muziekGegevens, "utf-8")
                print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                f.write(muziekGegevens + "\n")
            else:
                f.write(muziekGegevens + "\n")
def scrape(url,f): week = url.split("/") week = week[-1] url = URL(url) dom = DOM(url.download(cached = True)) # geeft de week i = 1 # de lijst van de top 40 selecteren for l in dom.by_tag("ol.top40"): # per nummer selecteren= print "lijst top 40" for e in l.by_tag("div.clearfix")[0:40]: muziekGegevens = "" #positie in de top 40 muziekGegevens += str(i) + "," print i , 'positie' i += 1 # opletten met resetten # de artiest selecteren for artiest in e.by_class("credit"): #error niet te veel elementen! muziekGegevens += artiest.content + "," #positie for inner in e.by_tag("strong")[1:2]: print inner.content , "1:2" muziekGegevens += inner.content + "," # hoogste notering for inner in e.by_tag("strong")[2:3]: print inner.content , "2:3" muziekGegevens += inner.content + "," # aantal punten for inner in e.by_tag("strong")[3:4]: print inner.content , "3:4" muziekGegevens += inner.content + "," # jaar van het nummer for inner in e.by_tag("strong")[4:5]: print inner.content.strip() , "4:5" muziekGegevens += inner.content.strip() h = HTMLParser.HTMLParser() muziekGegevens = h.unescape(muziekGegevens) if not whatisthis(muziekGegevens): muziekGegevens = unicode(muziekGegevens, "utf-8") print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf' f.write(muziekGegevens + "\n") else: f.write(muziekGegevens + "\n")
def get_countries(exceptions): """ Get the population density and ISO CODE 3166 for every country with manually added exceptions. """ url = URL("https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_population_density") html = url.download() dom = DOM(html) country_table = dom.by_tag('table.wikitable') countries = {} # Get every tablerow that got a country in it. for country in country_table[0]('tr')[4:-1]: # Some come with extra's added, which makes them put it in a <i> tag. try: link = country('i')[0]('a')[0].attrs['href'] name = country('i')[0]('a')[0].content.encode('ascii', 'ignore') except: if len(country('span')) == 0: link = country('td')[0]('a')[0].attrs['href'] name = country('td')[0]('a')[0].content.encode('ascii', 'ignore') else: link = country('td')[1]('a')[0].attrs['href'] name = country('td')[1]('a')[0].content.encode('ascii', 'ignore') density = country('td')[5].content.replace(',', '') # Not every wikipedia page is the same or got a ISO CODE at all. try: iso_code = retrieve_iso("https://en.wikipedia.org", link) except: iso_code = 'ERROR' # Add in manually added ISO CODES. if iso_code == 'ERROR' and name in exceptions: iso_code = exceptions[name] if iso_code == 'CY': # SVG doesn't contain northern cyprus unlike wikipedia, so I add it manually. density = 125 print iso_code, [name], density if iso_code != 'ERROR': countries[iso_code] = float(density) with open('data.txt', 'w') as outfile: json.dump(countries, outfile, indent=4) return countries
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # Fetch the index page once, then build the absolute address of every
    # movie from the first anchor in each title cell.
    dom = DOM(url.download(cached=True))
    return ['http://www.imdb.com' + cell.by_tag('a')[0].href
            for cell in dom.by_tag("td.titleColumn")]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    dom = DOM(url.download(cached=True))

    # Collect an absolute address for every anchor inside the first 250
    # title cells.
    # NOTE(review): despite the docstring, the entries are pattern.web.URL
    # instances, not plain strings; kept as-is for caller compatibility.
    movie_urls = []
    for cell in dom.by_tag("td.titleColumn")[:250]:
        for anchor in cell.by_tag("a"):
            movie_urls.append(URL("http://www.imdb.com" + anchor.attrs["href"]))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    dom = DOM(url.download(cached=True))
    base = "http://" + url.domain

    # The first "table.chart" on the page holds the 250 title cells; take
    # the first link of each cell and make it absolute.
    # NOTE(review): despite the docstring, the entries are pattern.web.URL
    # instances, not plain strings; kept as-is for caller compatibility.
    chart = dom.by_tag("table.chart")[0]
    movie_urls = []
    for cell in chart.by_tag("td.titleColumn"):
        first_link = cell.by_tag("a")[0]
        movie_urls.append(URL(base + first_link.attrs.get("href", "")))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
# Download every .jpg traffic-camera image linked from the Caltrans
# District 1 camera index page into data/test/.
from pattern.web import URL, DOM, extension, MIMETYPE_IMAGE
from pattern.web import Element, download
import urllib.request  # was `import urllib`: urllib.request is a submodule
                       # and must be imported explicitly on Python 3
import datetime
#libraries to check urllib (legacy vs not), pattern, requests

url = URL("http://www.dot.ca.gov/dist1/d1tmc/allcams.php")
dom = DOM(url.download(cached=True))

i = 0
try:
    for e in dom.by_tag('img'):
        # pattern.web elements expose their attribute dict as `.attrs`;
        # the original's `.attr` raised AttributeError on every element.
        src = e.attrs['src']
        if extension(src) == '.jpg':
            print(src)
            urllib.request.urlretrieve(src, "data/test/urllib{0}.jpg".format(i))
            #image = download(src, unicode= False, timeout= 5)
            #f = open("data/test/pattern{0}.jpg".format(i), 'wb')
            #f.write(image)
            i += 1
except Exception as err:  # narrowed from a bare except; report the failure
    print("error:", err)

"""
image = "http://www1.dot.ca.gov/cwwp2/data/d1/cctv/image/us101northofcushingcreeklookingsouth/us101northofcushingcreeklookingsouth.jpg"
url = URL(image)
print (url.mimetype in MIMETYPE_IMAGE)
urllib.request.urlretrieve(image, 'data/test2.jpg')
"""
# Selenium-driven scraper for the City of Chicago open-data portal.
ff = webdriver.Firefox()
ff.implicitly_wait(30)

## The datasets in the portal are spread over several pages, this loop
## instructs the scraper to go through each page.
## The html for the pages just updates the page number, so the URL() code
## keeps track of p and uses it to specify the page number
for p in range(1, 12):
    url = URL(
        'https://data.cityofchicago.org/browse?limitTo=datasets&sortBy=oldest&utf8=%E2%9C%93&page='
        + str(p))
    dom = DOM(url.download(cached=True))

    ## The list of datasets is in a table, so this loop cycles through the row
    ## elements to scrape each dataset.
    for i in dom.by_tag('tbody')[0:]:
        for g in i.by_tag('tr')[0:]:
            # dataset name, rendered to ascii plaintext
            for h in g.by_tag('a.name')[0:]:
                name = h.content
                name = plaintext(name)
                name = name.encode('ascii', 'ignore')
            # dataset category
            for j in g.by_class('category infoItem')[0:]:
                category = j.content
                category = plaintext(category)
                category = category.encode('ascii', 'ignore')
            # tags (a single blank when the row has none)
            if (g.by_class('tags infoItem')):
                tag = g.by_class('tags infoItem')[0].content
                tag = tag.encode('ascii', 'ignore')
            else:
                tag = " "
            # NOTE(review): this chunk is truncated here -- the body of the
            # loop below (visit counts, and whatever follows) continues
            # past the end of this excerpt.
            for k in g.by_class('visits')[0:]:
# The DOM elements can then be searched by tag name, CSS id, CSS class, ... # For example, top news entries on Reddit are coded as: # <div class="_1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah"> # ... # <span class="y8HYJ-y_lTUHkQIc1mdCq yj3st6-1 kYJFRo"> # ... # <a class="SQnoC3ObvgnGjWt90zD9Z " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a> # ... # </div> # # ... which - naturally - is a picture of a cat. url = URL("http://www.reddit.com/top/") dom = DOM(url.download(cached=True)) #print(dom.body.content) for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah" )[:5]: # Top 5 reddit entries. for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]: print(plaintext(a.content)) print(a.attrs["href"]) print("") # The links in the HTML source code may be relative, # e.g., "../img.jpg" instead of "www.domain.com/img.jpg". # We can get the absolute URL by prepending the base URL. # However, this can get messy with anchors, trailing slashes and redirected URL's. # A good way to get absolute URL's is to use the module's abs() function: from pattern.web import abs url = URL("http://nodebox.net") for link in DOM(url.download()).by_tag("a"): link = link.attrs.get("href", "")
# The DOM (Document Object Model) parses a string of HTML # and returns a tree of nested Element objects. # The DOM elements can then be searched by tag name, CSS id, CSS class, ... # For example, top news entries on Reddit are coded as: # <div class="entry"> # <p class="title"> # <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a> # ... # </div> # # ... which - naturally - is a picture of a cat. url = URL("http://www.reddit.com/top/") dom = DOM(url.download(cached=True)) #print dom.body.content for e in dom.by_tag("div.entry")[:5]: # Top 5 reddit entries. for a in e.by_tag("a.title")[:1]: # First <a class="title"> in entry. print plaintext(a.content) print a.attrs["href"] print # The links in the HTML source code may be relative, # e.g., "../img.jpg" instead of "www.domain.com/img.jpg". # We can get the absolute URL by prepending the base URL. # However, this can get messy with anchors, trailing slashes and redirected URL's. # A good way to get absolute URL's is to use the module's abs() function: from pattern.web import abs url = URL("http://nodebox.net") for link in DOM(url.download()).by_tag("a"): link = link.attrs.get("href", "") link = abs(link, base=url.redirect or url.string)
# Scrape the SATP 'major incidents' table for Pakistan and regroup the
# flat cell list into 5-field incident rows for CSV output.
import csv
from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

#For the 2013 datasheet, use this code:
url = URL(
    'http://www.satp.org/satporgtp/countries/pakistan/database/majorincidents.htm'
)
dom = DOM(url.download(cached=True))

myarray = []
tab = dom.by_tag('table')  # NOTE(review): looked up but never used below
# Collect the plain text of every <td> from index 11 on; the first 11
# cells are presumably page furniture (header/navigation) -- verify
# against the live page.
for i in dom.by_tag('td')[11:]:
    g = i.content
    h = plaintext(g)
    myarray.append(h)


def chunks(l, n):
    # Split list `l` into consecutive sublists of length `n`.
    return [l[i:i + n] for i in range(0, len(l), n)]


# Each incident spans 5 consecutive cells -> regroup into rows of 5.
yes = chunks(myarray, 5)

output = open("satpincidents2013.csv", "wb")
writer = csv.writer(output)
# NOTE(review): this chunk is truncated here -- the loop body that writes
# each row to the CSV continues past the end of this excerpt.
for i in yes[0:]:
# Convert the shuttle-stop XML dump into db.json via the project's
# streaming Writer helper (Writer is defined elsewhere in this file).
stop_str = open('api_data_store.xml', 'r').read()
dom = DOM(stop_str)

# Write the beginning
writer = Writer("db.json")
writer.beginObj()

# Write the stops
print "Writing stops..."
writer.writeKey("stops")
writer.beginArray()

namesToIDs = dict()  # stop name -> stop id, filled while streaming
stops = dom.by_tag("stop")
for stop in stops:
    # Get the stop id, name, title, and the stop's routes
    stop_id = stop.attributes["s_id"]
    name = stop.by_tag("title")[0].content
    # display title; the "HiLab-HBS" stop is renamed "i-Lab" for the UI
    title = "i-Lab" if name == "HiLab-HBS" else name
    routes = stop.by_tag("stop_routes")[0].content.split(",")

    # Map names to stop ids
    namesToIDs[name] = stop_id

    # Write out the stop to the db
    writer.beginObj()
    writer.writeKeyVal("id", stop_id, True)
    writer.writeKeyVal("name", name, True)
    writer.writeKeyVal("title", title, True)
    # NOTE(review): this chunk is truncated here -- the remaining fields
    # (e.g. routes) and the closing of the JSON objects/arrays continue
    # past the end of this excerpt.
# Scrape each player's season-stats table from his ESPN page and collect
# made threes and 3FG percentage per season into JSON.
players = ["3975/stephen-curry", "9/ray-allen", "552/reggie-miller",
           "841/jason-terry", "662/paul-pierce", "429/jason-kidd",
           "136/vince-carter", "165/jamal-crawford", "63/chauncey-billups",
           "2011/kyle-korver", "469/rashard-lewis", "813/peja-stojakovic",
           "1007/joe-johnson", "110/kobe-bryant"]
htmllink = "http://espn.go.com/nba/player/stats/_/id/"
output_file = open('new3pointers.json', 'w')

# get data for all players
for player in players:
    TARGET_URL = URL(htmllink + player)
    dom = DOM(TARGET_URL.download(cached=True))
    dataofyear = list()
    # NOTE(review): a single dict reused for every row -- if it is appended
    # to dataofyear later (past this excerpt), all entries would alias the
    # same object; verify intent.
    tempdata = dict()
    # loop over the html table
    for e in dom.by_tag("div.mod-container mod-table mod-player-stats"):
        for a in e.by_tag("div.mod-content")[1:2]:
            for tablehead in a.by_class("tablehead"):
                year = -1
                for oddrow in tablehead.by_class("oddrow"):
                    # oddrow[4] is presumably the "made-attempted" cell
                    # (e.g. "5-11"): keep the first 3 chars and strip the
                    # dash before converting -- confirm column layout.
                    madeshots = oddrow[4].content[:3]
                    madeshots = int(madeshots.replace("-", ""))
                    # only odd rows are visited, so the season counter
                    # advances by 2 per row
                    year += 2
                    percentage = float(oddrow[5].content)
                    tempdata["year"] = year
                    tempdata["tot3fg"] = madeshots
                    tempdata["percentage"] = percentage
                    # NOTE(review): chunk is truncated here -- whatever
                    # stores tempdata into dataofyear / output_file
                    # continues past the end of this excerpt.
for row in rows: self.writerow(row) # Creating the csv output file for writing into as well as defining the writer output = open("data_output_Trulia_HP.csv", "wb") writer = UnicodeWriter(output) writer.writerow( ["State", "County", "Average Listing Price", "Median Sales Price"]) # get the DOM object to scrape for links url = URL("http://www.trulia.com/home_prices/") dom = DOM(url.download(cached=True)) # get the rows where all info is contained all_data_rows = dom.by_tag("tr") # define the variable to store all the trulia data all_trulia_data = [] # loop through each row for ind_data_row in all_data_rows: if (ind_data_row.attributes.get("style", "") == 'background-color: #FFFFFF;' or ind_data_row.attributes.get( "style", "") == 'background-color: #EDEFF2;'): all_columns = ind_data_row.by_tag("td") state = plaintext(all_columns[0].by_tag("a")[0].content) avg_listing_price = plaintext(all_columns[1].content) median_sales_price = plaintext(all_columns[2].content)