def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
        # Pull the leading digits out of the runtime string (e.g. "60 min" -> "60").
        try:
            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))
        except Exception:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception:
            pass
        domIndex += 1  # advance in step with the title list (as in the fuller variant below)
def extract_percentages(dom):
    # NOTE: the dom argument is unused; the page is re-downloaded from TARGET_URL,
    # and only the first percentage found is returned.
    file_url = URL(TARGET_URL)
    file_dom = DOM(file_url.download())
    percentage_list = []
    if file_dom.by_class('percentage'):
        for item in file_dom.by_class('percentage'):
            percentage_list.append(item.content.encode('utf-8'))
        return percentage_list[0]
    else:
        return "nodata"
def scrape_api(county_num):
    # County numbers are zero-padded to two digits in the query string.
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    #url = 'http://dq.cde.ca.gov/dataquest/Acnt2012/2011Base_Co.aspx?cYear=&cSelect=02'
    url = 'http://dq.cde.ca.gov/dataquest/Acnt2012/2011Base_Co.aspx?cYear=&cSelect=' + county_num
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM

    # grab the value for each district and sum them up to obtain the county total value
    districts = dom.by_class('medium\+_left')
    num_students_county_total = 0
    api_county_total = 0
    for n in districts:
        # grab and sum number of students; skip blank (non-breaking-space) cells
        district_num_students = n.parent.by_class("medium_center")[0].content
        if not u"\xa0" in district_num_students:
            # cast to int
            district_num_students = int(district_num_students.replace(',', ''))
            num_students_county_total += district_num_students
            # grab the API for each district
            district_api = n.parent.by_class("medium_center")[1].content
            # remove any asterisks
            district_api = district_api.replace('*', '')
            # cast to int
            district_api = int(district_api.replace(',', ''))
            # add the API weighted by the number of students in the current district
            api_county_total += district_api * district_num_students

    # divide the weighted sum of APIs by the total number of students in the county
    average_api = api_county_total / num_students_county_total
    API_num_students = dom.by_class('medium\+_left')[0].parent.by_class("medium_center")[0].content
    # use county number as a placeholder for the county name for now, as the county name is not easily scrapable
    county = county_num

    # write all the collected data to a new row of the output file
    writer.writerow([str(county), str(num_students_county_total), str(average_api)])
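# Worked example of the weighted average above: two districts with
# (students, API) = (1000, 800) and (3000, 900) give
# (800*1000 + 900*3000) / (1000 + 3000) = 3500000 / 4000 = 875.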
def get_by_year(year):
    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    dictAll = {}

    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        # The first <strong> holds the winner; the rest hold the nominees.
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])

        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            nominee_id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name), nominee_id))

        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner), winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] = winnersAndNominees
    return dictAll
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Fetch the html from the url and build a DOM from it
    html = url.download()
    dom = DOM(html)

    # Every url starts with this root; it is needed to build the absolute path
    root = 'http://www.imdb.com'

    # Each movie's url sits in a td tag with class titleColumn
    for movie in dom.by_class("titleColumn"):
        # Build a DOM from the content between the td tags so we can search inside it
        movieinfo = DOM(movie.content)
        # The relative path of each movie is the 'href' value of the first 'a' tag.
        # Concatenate the root and the relative path into the absolute path and append it to movie_urls
        movie_urls.append(root + movieinfo.by_tag("a")[0].attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
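# A minimal sketch of driving scrape_top_250 (assumes pattern.web is
# installed; the chart URL below is illustrative):
from pattern.web import URL

if __name__ == '__main__':
    top250 = URL('http://www.imdb.com/chart/top')
    for movie_url in scrape_top_250(top250)[:5]:
        print movie_url  # expect absolute http://www.imdb.com/... links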
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    print(url)
    url_html = url.download(cached=True)
    url_dom = DOM(url_html)

    movie_urls = []
    for movie in url_dom.by_class("titleColumn"):
        # looks for the element containing the link.
        movie_url = movie.by_tag("a")[0]
        # Gets a dictionary of the element's attributes.
        movie_url = movie_url.attrs['href']
        # Splits the string at the '?' to drop the tracking query string.
        movie_url = movie_url.split('?')
        # Forms the full url and appends it to the list of movie urls
        movie_url = "http://www.imdb.com" + movie_url[0]
        movie_urls.append(movie_url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_starrtest(county_num):
    # County numbers are zero-padded to two digits in the query string.
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    #url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM

    # student counts per subject ("rm" cells) and percentages ("rs" cells)
    #sciend_num = dom.by_class("rm")[4].content
    scicst_num = dom.by_class("rm")[3].content
    math_num = dom.by_class("rm")[2].content
    hist_num = dom.by_class("rm")[1].content
    ela_num = dom.by_class("rm")[0].content

    #sciend_percent = dom.by_class("rs")[4].content[:5]
    scicst_percent = dom.by_class("rs")[3].content[:5]
    math_percent = dom.by_class("rs")[2].content[:5]
    hist_percent = dom.by_class("rs")[1].content[:5]
    ela_percent = dom.by_class("rs")[0].content[:5]

    county = dom.by_tag("h2")[0].content

    # write all the collected data to a new row of the output file
    writer.writerow([county, ela_num, ela_percent, hist_num, hist_percent,
                     math_num, math_percent, scicst_num, scicst_percent])
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
        try:
            titleCatalog[domIndex].addRunTime(str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
        except Exception:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception:
            pass
        try:
            for genre in dom.by_class("genre")[domIndex].by_tag("a"):
                titleCatalog[domIndex].addGenre(str(genre.content).encode('ascii', 'replace'))
        except Exception:
            pass
        try:
            for credit in dom.by_class("credit")[domIndex].by_tag("a"):
                titleCatalog[domIndex].addActors(str(credit.content).encode('ascii', 'replace'))
        except Exception:
            pass
        domIndex += 1
def scrape_page(url):
    html = url.download()
    dom = DOM(html)
    table = DOM(dom.by_class("wikitable")[0].content)
    countrylist = table.by_tag("tr")[1:]
    pointsdict = {}
    for c in countrylist:
        infodict = {}
        infodict["name"] = c.by_tag("a")[-1].content.encode("utf-8")
        infodict["Overall"] = int(c.by_tag("td")[2].content.encode("utf-8"))
        infodict["Female"] = int(c.by_tag("td")[4].content.encode("utf-8"))
        infodict["Male"] = int(c.by_tag("td")[6].content.encode("utf-8"))
        # Bucket the overall life expectancy into fill keys for the map.
        if infodict["Overall"] > 80:
            infodict["fillKey"] = "HIGH"
        elif infodict["Overall"] > 70:
            infodict["fillKey"] = "ABVAVG"
        elif infodict["Overall"] > 60:
            infodict["fillKey"] = "AVG"
        elif infodict["Overall"] > 50:
            infodict["fillKey"] = "BELAVG"
        else:
            infodict["fillKey"] = "LOW"
        code = ""
        for countryCode in countryCodes:
            if infodict["name"] == countryCode[2]:
                code = countryCode[1]
                break
        # If the code was not found, skip this country
        if code == "":
            pass
        # Otherwise use the code as key, with that country's infodict as value
        else:
            pointsdict[code] = infodict
    json.dump(pointsdict, open("lifeexpectancy.json", "wb"))
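# Example of the bucketing above: a country with Overall == 76 satisfies the
# "> 70" branch (but not "> 80"), so it gets fillKey "ABVAVG".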
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download())
    for e in dom.by_class('titleColumn'):
        for href in e('a')[:1]:
            movie_urls.append("http://www.imdb.com" + href.attrs["href"])

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
# Create csv and add a header row
output = open("races_data.csv", "wb")
writer = csv.writer(output)
writer.writerow(["Race", "Year", "POS", "Num", "rider ID", "Rider URL",
                 "Rider01", "rider02", "Machine", "Time", "Speed"])

# Set up base URL and main URL. ERA 5 = 1991 - 2012
eras = ["1", "2", "3", "4", "5"]
for era in eras:
    print "Era:" + era
    url = URL("http://www.iomtt.com/TT-Database/Events.aspx?meet_code=TT2012&era=" + era)
    text_url = "http://www.iomtt.com"

    # Get hold of the dom, then grab each year's URL, which is embedded in li tags.
    dom = DOM(url.download(cached=True))
    years = dom.by_class("ttDatabasePipeSeparator floatleft")[0].by_tag("li")

    # Iterate over each year
    for year in years:
        # Print commands are useful to monitor progress.
        print("year:")
        print year.by_tag("a")[0].attributes.get('href', '')

        # Find the current year's URL and download its DOM.
        new_url = URL(text_url + year.by_tag("a")[0].attributes.get('href', ''))
        year_url = URL(new_url)
        year_dom = DOM(year_url.download(cached=True))

        #races = year_dom.by_class("panelinner clearfix")[0].by_tag("ul")[0].by_tag("li")
        races_div = races = year_dom.by_class("ttDatabase")[0].by_class("panelinner")[1].by_tag("ul")
        if len(races_div) > 1:
            races = races_div[0].by_tag("li")
######################################## Test Techcrunch - https://techcrunch.com/ ####################################
print("#" * 40, "Test Techcrunch", "#" * 40)
url = URL("https://techcrunch.com/startups/")
dom = DOM(url.download(cached=True))
for e in dom.by_tag("header.post-block__header")[:5]:
    for a in e.by_tag("h2.post-block__title")[:1]:
        print(plaintext(a.content))
        for h in a.by_tag("a.post-block__title__link")[:1]:
            print(h.attrs["href"])
            print("")
print("\n")

header = dom.by_class("river__title")[0]
print(header.content)
print("\n")

title_image = dom.by_attr(name="msapplication-TileImage")[0]
print(title_image.attrs['content'])
print("\n")

url = URL("https://techcrunch.com")
dom = DOM(url.download(cached=True))
for k in dom.by_class("post-block__title__link"):
    print(k.content.strip())
    print("")
print("\n")
def getTitle(self, link):
    html = URL(link).download()
    body = DOM(html).body
    title = body.by_class("title-news")[0].content.strip()
    return title
writer.writerow(["CourseID", "InstructorCode", "InstructorName", "InstructorURL", "InstructorEmail"]) date_to_write = [] all_urls = ["http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_1%2C9&SEARCH_TERM=both", "http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_2%2C9&SEARCH_TERM=both", "http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_5%2C6&SEARCH_TERM=both"] for ind_url in all_urls: # DOM object for each concentration url = URL(ind_url) dom = DOM(url.download(cached=True)) # get main content containing all the courses main_content = dom.by_class("csearchresults") # get all the rows that have the course data all_data_rows = main_content[0].by_tag("tr") # loop through each row for ind_data_row in all_data_rows: if ind_data_row.attributes.get("class", "") == "" or ind_data_row.attributes.get("class", "") == "odd": all_columns = ind_data_row.by_tag("td") # ensure course is not cancelled if len(all_columns) > 1 and plaintext(str(all_columns[4])).find("Canceled") == -1: course_id = ""
def get_info(baseurl, out_filename, npages=200):
    output = open(out_filename, "w")
    w = writer.UnicodeWriter(output)
    # TODO: fix this header
    w.writerow([
        "Title", "Rating", "Calories (kcal)", "Cholesterol (mg)", "Fat (g)",
        "Protein (g)", "Fiber (g)", "Sodium (mg)", "Cook Time", "Ingredients",
        "Full Ingredients",
    ])
    for page in range(1, npages):
        try:
            url = URL(baseurl + "?Page=%d" % page)
            dom = DOM(url.download(cached=True))
            links = dom.by_class("rectitlediv")
            # goes through the 20 recipes on a given page
            for index in range(len(links)):
                # get the link name
                title = links[index].content.split("/recipe/")[1].split("/detail")[0]
                # download individual recipe
                rpage = URL(os.path.join(base, title, end))
                pdom = DOM(rpage.download(cached=True))
                # average rating value
                rating = pdom.by_attribute(itemprop="ratingValue")[0].source.split('"')[3]
                # list of nutrition elements
                nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                nut_vals = []
                for i in range(len(nut_list)):
                    val = nut_list[i].by_attribute(id="lblNutrientValue")[0].content
                    nut_vals.append(val)
                nuts = "\t".join(nut_vals)
                # time needed to cook
                try:
                    cook_hours = pdom.by_attribute(id="cookHoursSpan")[0].content
                    cook_hours = cook_hours.replace("<em>", " ").replace("</em>", " ")
                except Exception:
                    cook_hours = "0"
                try:
                    cook_mins = pdom.by_attribute(id="cookMinsSpan")[0].content
                    cook_mins = cook_mins.replace("<em>", " ").replace("</em>", " ")
                except Exception:
                    cook_mins = "0"
                mins = str(int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0]))
                # ingredients
                ## gets the block containing both the ingredient and the amount
                all_ings = pdom.by_attribute(itemprop="ingredients")
                ing_units = []
                ing_vals = []
                for ing_index in range(len(all_ings)):
                    tmp_ing = all_ings[ing_index].by_id("lblIngName").content
                    # skip blank (non-breaking-space) ingredient blocks
                    if u"\xa0" in all_ings[ing_index].content:
                        continue
                    try:
                        tmp_amount = all_ings[ing_index].by_id("lblIngAmount").content
                    except Exception:
                        tmp_amount = ""  # LET THIS BE THE EMPTY CHAR we decide on
                    ing_units.append(tmp_amount)
                    ing_vals.append(tmp_ing)
                ings = ";".join(ing_vals)
                ing_units = [x + "|" for x in ing_units]
                str_ings = [str(x) for x in zip(ing_units, ing_vals)]
                str_ings = [x.replace(",", " ") for x in str_ings]
                full_ings = ";".join(str_ings)
                full_ings = (full_ings.replace("u'", "")
                                      .replace("'", "")
                                      .replace(", u", "")
                                      .replace("(", "")
                                      .replace(")", "")
                                      .replace("  ", " "))
                assert len(ing_vals) == len(ing_units)
                w.writerow([title, rating, nuts, mins, ings, full_ings])
        except Exception:
            pass
    output.close()
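# Worked example of the ingredient formatting above: ing_units == ['1 cup'] and
# ing_vals == ['flour'] give str(('1 cup|', 'flour')) == "('1 cup|', 'flour')";
# the comma becomes a space, the quotes and parentheses are stripped (the "u'"
# replace handles Python 2 unicode reprs), and doubled spaces collapse,
# leaving "1 cup| flour" in full_ings.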
def get_title_attributes(title, titleLink):
    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii', 'replace'))
    print "Movie: ", title

    # Get directors
    print "-> About to print directors... "
    directors = dom.by_attribute(itemprop="director")[0]
    directorNames = directors.by_tag("a")
    for director in directorNames:
        print director.content
        dirName = unicodedata.normalize('NFD', director.content).encode('ascii', 'replace')
        print "Director ===> ", dirName
        titleObj.addDirectors(dirName)

    # Get writers
    print "-> About to print writers... "
    try:
        writers = dom.by_attribute(itemprop="writer")
        for writer in writers:
            titleObj.addWriters(str(writer[1][1].content).encode('ascii', 'replace'))
    except Exception:
        pass

    # Get actors
    print "--> About to get actors... "
    try:
        actors = dom.by_attribute(itemprop="actors")
        for actor in actors:
            titleObj.addActors(str(actor[1][1].content).encode('ascii', 'replace'))
    except Exception:
        pass

    # Get rating information
    print "--> About to get rating information... "
    try:
        ratingsInfo = dom.by_class("star-box-giga-star")
        for rating in ratingsInfo:
            titleObj.addRating(str(rating.content).encode('ascii', 'replace'))
    except Exception:
        pass

    # Get runtime and genres from the infobar
    print "--> About to print other stuff... "
    for item in dom.by_class("infobar"):
        try:
            objMatch = re.search("(\d+)", item.by_tag("time")[0].content)
            if objMatch:
                titleObj.addRunTime(str(objMatch.group(1)).encode('ascii', 'replace'))
        except Exception:
            pass
        for genreItem in item.by_tag("a"):
            try:
                objMatch = re.search("genre", genreItem.attributes['href'])
                if objMatch:
                    titleObj.addGenre(str(genreItem.content).encode('ascii', 'replace'))
            except Exception:
                pass
    return titleObj
def scrape_movie_page(dom):
    '''
    Scrape the IMDB page for a single movie

    Args:
        dom: pattern.web.DOM instance representing the page of 1 single movie.

    Returns:
        A list of strings representing the following (in order): title, year,
        duration, genre(s) (semicolon separated if several), director(s)
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings.
    '''
    # get title
    title = clean_unicode(dom.by_class('header')[0].content)
    title = plaintext(strip_between('<span', '</span>', title))

    # get duration from the infobar (e.g. "142 min")
    duration = clean_unicode(dom.by_class('infobar')[0].by_tag('time')[0].content)

    # get genres
    genres = []
    for genre in dom.by_class('infobar')[0].by_tag('a')[:-1]:
        genres.append(clean_unicode(genre.content))

    # make lists for info
    directors = []
    writers = []
    actors = []

    # get directors, writers, actors
    text_blocks = dom.by_class('txt-block')[:3]
    for t in text_blocks:
        spans = t.by_tag('span')
        for s in spans:
            if s.attributes.get('itemprop') == 'director':
                director = s.by_tag('span')[0].by_tag('a')[0].content
                directors.append(clean_unicode(director))
            if s.attributes.get('itemprop') == 'writer':
                p_writer = s.by_tag('span')[0].by_tag('a')[0].content
                writers.append(clean_unicode(p_writer))
            if s.attributes.get('itemprop') == 'actors':
                actor = s.by_tag('span')[0].by_tag('a')[0].content
                actors.append(clean_unicode(actor))

    rating = []
    ratings_count = []

    # get ratings
    spans = dom.by_class('star-box-details')[0].by_tag('span')
    for s in spans:
        if s.attributes.get('itemprop') == 'ratingValue':
            rating = clean_unicode(s.content)
        if s.attributes.get('itemprop') == 'ratingCount':
            ratings_count = clean_unicode(s.content)

    # format the strings from lists
    genres = concat_strings(genres)
    directors = concat_strings(directors)
    writers = concat_strings(writers)
    actors = concat_strings(actors)

    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return title, duration, genres, directors, writers, actors, rating, \
        ratings_count
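# A hedged usage sketch for scrape_movie_page (assumes the pattern.web imports
# used above; clean_unicode and concat_strings are defined elsewhere in this
# codebase, and the example title URL is illustrative):
if __name__ == '__main__':
    example_url = URL('http://www.imdb.com/title/tt0111161/')
    example_dom = DOM(example_url.download(cached=True))
    print scrape_movie_page(example_dom)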
movieUrl = URL(movieTitleLinks.group(0))
movieDom = DOM(movieUrl.download(cached=True))

#=======================================================================
# Get the title
#=======================================================================
for movie in movieDom.by_tag("title"):
    title = re.sub(' \(\d+\) - IMDb', '', movie.content.encode('ascii', 'ignore').strip())

#=======================================================================
# Get the runtime
#=======================================================================
for movie in movieDom.by_class("infobar"):
    time = re.search('\d+ min', movie.content.encode('ascii', 'ignore').strip())
    runtime = re.sub(' min', '', time.group(0))

    #===================================================================
    # Get the genres
    #===================================================================
    genre = []
    for g in movie.by_tag('a'):
        type = re.sub('\n|\d+.*|\(.*\)', '', g.content.encode('ascii', 'ignore').strip('\r\n'))
        if ((type != ' \n') and not (re.match('^\s+', type))):
            genre.append(type)
    def writerows(self, rows):
        # (tail of the standard csv UnicodeWriter recipe; the rest of the
        # class is truncated in this snippet)
        for row in rows:
            self.writerow(row)

# Creating the csv output file for writing into as well as defining the writer
output = open("restweek.csv", "wb")
writer = UnicodeWriter(output)

# add header row
writer.writerow(["Name", "Neighborhood", "Cuisine", "Dining Style", "Meals Served",
                 "Dress Code", "Ratings", "Price", "Phone Number", "Address", "Website"])

# Get the DOM object to scrape for movie links. [Hint: Use absolute URL's.
# Documentation can be found here: http://www.clips.ua.ac.be/pages/pattern-web]
url = URL("http://www.opentable.com/promo.aspx?m=7&ref=470&pid=90")
dom = DOM(url.download(cached=True))

for restaurant in dom.by_class("ResultRow"):
    name = restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].content.encode('ascii', 'ignore')

    neighborhood_cuisine = restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_class("d")[0].content.encode('ascii', 'ignore')
    neighborhood_cuisine = neighborhood_cuisine.split('|')
    neighborhood = neighborhood_cuisine[0]
    cuisine = neighborhood_cuisine[1]

    meals = restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_class("message")[0].content.encode('ascii', 'ignore')
    meals = meals.split('<')  # need to clean
    meals = meals[0]

    restURL = URL(abs(restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].attributes.get('href', ''),
                      base=url.redirect or url.string))
    restDOM = DOM(restURL.download(cached=True))

    # need to clean
    address = restDOM.by_id("ProfileOverview_lblAddressText").content
    price = restDOM.by_id("ProfileOverview_lblPriceText").content
# http://www.mdbg.net/chindict/chindict.php?page=radicals
# and stores it into a JSON object
import re, json, io
from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

chindict = {}
radicals = []

url = URL("http://www.mdbg.net/chindict/chindict.php?page=radicals")
dom = DOM(url.download(cached=True))

radElements = dom('a.rad')
for rad in radElements:
    radicals.append(rad.content)
    words = []
    wordUrl = URL("http://www.mdbg.net/chindict/" + rad.attrs["href"])
    wordDom = DOM(wordUrl.download(cached=True))
    wordTable = wordDom.by_class("results")
    for word in wordTable[0].by_tag("span"):
        words.append(word.content)
    chindict[rad.content] = words

with open('chindict.js', 'w') as outfile:
    json.dump(radicals, outfile)
    json.dump(chindict, outfile)
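# Note: the two json.dump calls above write two JSON documents back-to-back,
# so a single json.load on the file will fail. A sketch of reading both
# objects back with only the standard library:
def load_chindict(path='chindict.js'):
    with open(path) as infile:
        text = infile.read()
    decoder = json.JSONDecoder()
    radicals, offset = decoder.raw_decode(text)  # first document: the radicals list
    while offset < len(text) and text[offset].isspace():
        offset += 1                              # tolerate whitespace between the two dumps
    chindict, _ = decoder.raw_decode(text, offset)  # second document: radical -> words dict
    return radicals, chindict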
def get_mov_link(search_url, mov_title, mov_year, proxy):
    mov_url = URL(search_url)

    # Hard-coded links for titles the site search cannot handle.
    if mov_title == 'M':
        return "http://www.1channel.ch/watch-48002-M"
    if mov_title == u"8\u00BD":
        return "http://www.1channel.ch/watch-1188-8189"
    if mov_title == u"Nausica\u00E4 of the Valley of the Wind":
        return "http://www.1channel.ch/watch-998-Nausicaa-of-the-Valley-of-the-Winds"

    #try:
    mov_dom = DOM(mov_url.download(cached=False, timeout=25, proxy=proxy))
    #    print "Downloaded search dom for:", mov_title, "(" + str(mov_year) + ")"
    #except Exception, e:
    #    print "Could not download search url: ", mov_url, 'for reason:', e

    mov_ind = mov_dom.by_class("index_container")
    if mov_ind[0].by_class("info_message"):
        print mov_title, "not found"
        return None
    else:
        for r in mov_ind[0].by_class("index_item index_item_ie"):
            # grab the search result's title
            res_title = r.by_tag("a")[0].attributes.get("title")
            # split out the year based on "(\d+)", assign title to res_t and year to res_y
            res_ts = re.search("Watch (.+)\s\((\d+)", res_title)
            res_t = res_ts.group(1)
            res_y = res_ts.group(2)

            # Manual corrections for titles and years that differ between
            # IMDB and the target site.
            if mov_title == 'The Good, the Bad and the Ugly':
                mov_year = 1967
            if mov_title == 'The Dark Knight':
                mov_title = 'Batman: The Dark Knight'
            if mov_title == "One Flew Over the Cuckoo's Nest":
                mov_year = 1976
            if mov_title == 'Star Wars':
                mov_title = 'Star Wars: Episode IV - A New Hope'
            if mov_title == 'Seven Samurai':
                mov_year = 1956
            if mov_title == 'Once Upon a Time in the West':
                mov_title = "Once Upon a Time in the West - (C'era una volta il West)"
            if mov_title == 'Casablanca':
                mov_year = 1943
            if mov_title == 'Rear Window':
                mov_year = 1955
            if mov_title == "It's a Wonderful Life":
                mov_year = 1947
            if mov_title == "The Pianist":
                mov_year = 2003
            if mov_title == u'L\xe9on: The Professional':
                mov_title = "Leon The Professional"
            if mov_title == u"Am\xe9lie":
                mov_title = "Amelie from Montmartre"
            if mov_title == "Princess Mononoke":
                mov_title = "Princess Mononoke (Mononoke-hime)"
            if mov_title == "Witness for the Prosecution":
                mov_year = 1958
            if mov_title == 'Grave of the Fireflies':
                mov_title = "Grave of the Fireflies (Hotaru no haka)"
            if mov_title == 'Snatch.':
                mov_title = "Snatch"
                mov_year = 2001
            if mov_title == 'The General':
                mov_year = 1927
            if mov_title == 'Gran Torino':
                mov_year = 2009
            if mov_title == 'Hotel Rwanda':
                mov_year = 2005
            if mov_title == 'V for Vendetta':
                mov_year = 2006
            # Foreign title
            if mov_title == "The Secret in Their Eyes":
                mov_title = "El secreto de sus ojos"
            if mov_title == "There Will Be Blood":
                mov_year = 2008
            if mov_title == "Million Dollar Baby":
                mov_year = 2005
            if mov_title == "Amores Perros":
                mov_title = "Amores perros"
            if mov_title == "Life of Pi":
                mov_title = "Life Of PI"
            if mov_title == "The 400 Blows":
                mov_title = "The 400 Blows (Les quatre cents coups)"
            if mov_title == "Howl's Moving Castle":
                mov_title = "Howl's Moving Castle (Hauru no ugoku shiro)"
            if mov_title == "La strada":
                mov_title = "La Strada"
            if mov_title == "The Wild Bunch":
                mov_title = "The Wild Bunch (1969)"
            if mov_title == "A Fistful of Dollars":
                mov_title = "A Fistful of Dollars - (Per un pugno di dollari)"
            if mov_title == "Slumdog Millionaire":
                mov_year = 2009
            if mov_title == "Stalker":
                mov_year = 1980
            if mov_title == "Harry Potter and the Deathly Hallows: Part 2":
                mov_title = "Harry Potter and the Deathly Hallows 2"
            if mov_title == "The Wrestler":
                mov_year = 2009
            if mov_title == "Spring, Summer, Fall, Winter... and Spring":
                mov_title = "Spring, Summer, Fall, Winter...and Spring (Bom yeoreum gaeul gyeoul geurigo bom)"
            if mov_title == "Castle in the Sky":
                mov_title = "Castle in The Sky"

            print res_t, res_y, mov_title.strip(), mov_year
            if res_t.strip() == mov_title.strip() and int(res_y) == int(mov_year):
                return abs_url(r.by_tag("a")[0].attributes.get("href"), base=mov_url.redirect or mov_url.string)
    # (tail of save_csv; the function header is truncated in this snippet)
    for i in range(0, len(tvseries)):
        writer.writerow(tvseries[i])
    f.close()


if __name__ == '__main__':
    # Download the HTML file
    url = URL(TARGET_URL)
    html = url.download()

    # Save a copy to disk in the current directory; this serves as a backup
    # of the original HTML and will be used in grading.
    with open(BACKUP_HTML, 'wb') as f:
        f.write(html)

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Extract the tv series (using the function you implemented)
    tvseries = []
    count = 0
    for i in dom.by_class("lister-item-content"):
        tvseries.append(extract_tvseries(dom, count))
        count = count + 1

    # Write the CSV file to disk (including a header)
    with open(OUTPUT_CSV, 'wb') as output_file:
        save_csv(output_file, tvseries)

    print tvseries
url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'OCaml&offset=&limit=500' + '&action=history'
url = URL(url_string)
dom = DOM(url.download(cached=True))
engine = Wikipedia(license=None)
article = engine.search('Ocaml')
a = 0
# NOTE: dom is never re-downloaded inside the loop, so as written this only
# ever sees the first history page.
while (len(dom.by_class("mw-nextlink")) > 0):
    page_history_links = dom.by_tag("ul")[0].by_tag("li")
    for link in page_history_links:
        date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii', 'ignore')
        ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii', 'ignore')
        bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii', 'ignore')

        # Geolocate the editor's IP address.
        ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
        req = urllib2.urlopen(ip_url)
        req_request = urllib2.Request(ip_url)
        read_ip_data = HTMLParser.HTMLParser().unescape(req.read()).encode('ascii', 'ignore')
        if read_ip_data.split()[0] != '<html>':
            print "the date\n"
            print date
# (helper for stripping accents; the original function header is truncated in
# this snippet, so the name below is assumed)
def to_ascii(stringToConvert):
    if (isinstance(stringToConvert, unicode)):
        stringToConvert = unicodedata.normalize('NFKD', stringToConvert).encode('ascii', 'ignore')
    return stringToConvert

# Create csv and add a header row
output = open("races_data.csv", "wb")
writer = csv.writer(output)
writer.writerow(["Race", "Year", "POS", "Num", "rider ID", "Rider URL",
                 "Rider01", "rider02", "Machine", "Time", "Speed"])

# Set up base URL and main URL. ERA 5 = 1991 - 2012
url = URL("http://www.iomtt.com/TT-Database/Events.aspx?meet_code=TT2012&era=5")
text_url = "http://www.iomtt.com"

# Get hold of the dom, then grab each year's URL, which is embedded in li tags.
dom = DOM(url.download(cached=True))
years = dom.by_class("ttDatabasePipeSeparator floatleft")[0].by_tag("li")

# Iterate over each year
for year in years:
    # Print commands are useful to monitor progress.
    print("")
    print year.by_tag("a")[0].attributes.get('href', '')

    # Find the current year's URL and download its DOM.
    new_url = URL(text_url + year.by_tag("a")[0].attributes.get('href', ''))
    year_url = URL(new_url)
    year_dom = DOM(year_url.download(cached=True))

    races = year_dom.by_class("grid_10 alpha hideoverflow")[0].by_tag("li")
    # The first 22 URLs belong to the year, and this is consistent across the site, so those URLs are skipped.
    for race in races[22:-4]:
# Creating the csv output file for writing into as well as defining the writer
output = open("data_output_WIKI_EA.csv", "wb")
writer = UnicodeWriter(output)

# add header row
writer.writerow(["State", "Rank", "EA", "Degree"])

# get the DOM object to scrape for links
url = URL("http://en.wikipedia.org/wiki/List_of_U.S._states_by_educational_attainment")
dom = DOM(url.download(cached=True))

# get the tables where all info is contained
all_data_tables = dom.by_class("wikitable")

# define the variable to store all the WIKI data
all_wiki_data = []

# loop through each table and read off the degree type from its header
for ind_data_table in all_data_tables:
    degree = ""
    for ind_data_header in ind_data_table.by_tag("th"):
        if "H.S. Graduate" in plaintext(ind_data_header.content):
            degree = "High School"
        if "Bachelor's Degree" in plaintext(ind_data_header.content):
            degree = "Undergraduate"
        if "Advanced Degree" in plaintext(ind_data_header.content):
urls = ["http://www.campusservices.harvard.edu/transit-fleet/evening-nights-monday-friday",
        "http://www.campusservices.harvard.edu/transit-fleet/morning-afternoon-monday-friday",
        "http://www.campusservices.harvard.edu/transit-fleet/weekends",
        "http://www.campusservices.harvard.edu/transit-fleet/overnight-service"]

count = 1
for urlStr in urls:
    url = URL(urlStr)
    dom = DOM(url.download(cached=False))
    print "Parsing " + urlStr

    body = dom.by_class("field-items")[0]
    tables = body.by_tag("table")
    headers = body.by_tag("h2")
    for i, table in enumerate(tables):
        writer = Writer(str(count) + ".csv")
        header = headers[i].content
        print ""
        print "Parsing route " + str(header)
        writer.writeLine(urlStr.split("/")[-1])
        writer.writeLine(header)
        header = True
        for row in table.by_tag("tr"):
            #lst = []
            for cell in row.by_tag("td"):
                if (header and len(cell.by_tag("strong")) != 0):
def fetchWeatherDataForAirportAndYear(airport, year, month):
    # Row offset of each statistic within the monthly summary table.
    stat_rows = {
        "avg_temp": 3,
        "avg_max_temp": 2,
        "avg_min_temp": 4,
        "avg_dew_point": 10,
        "avg_precipitation": 12,
        "avg_wind": 15,
        "avg_gust_wind": 16,
        "avg_sea_level_pressure": 18,
    }
    try:
        url = URL(urlForAirportAndYear(airport, year, month))
        dom = DOM(url.download(cached=True))
        table = dom.by_class("contentData")[0].by_tag("table")[0]
        data = {}
        for stat, row in stat_rows.items():
            cells = table.by_tag("tr")[row].by_class("b")
            # The second "b" cell holds the value; fall back to "" if the row is empty.
            data[stat] = cells[1].content if cells else ""
        return data
    except URLTimeout:
        # timeout fetching data
        return {}
    except Exception:
        # unknown error fetching data
        return {}
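# A minimal sketch of calling the fetcher above (urlForAirportAndYear and the
# pattern.web imports are assumed to be defined elsewhere in this script; the
# airport code is illustrative):
if __name__ == '__main__':
    stats = fetchWeatherDataForAirportAndYear('KBOS', 2012, 6)
    if stats:
        print stats.get('avg_temp', ''), stats.get('avg_precipitation', '')
    else:
        print 'no data fetched'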
# I use team_labels[0], but it doesn't matter because the labels are the same.
# Get the "th"s because the elements look like this:
# <th align="left" class="tooltip sort_default_asc">Franchise</th>
all_labels = team_labels[0].by_tag("th")
team_label_container = []
for label in all_labels:
    team_label_container.append(label.content.encode("utf8"))

# team_label_container now has the headers
team_label_container.insert(0, 'team_acronym')
team_container = []
team_container.append(team_label_container)

# Now get the statistics.
teams = dom.by_class("full_table")
for team in teams:
    this_team_container = []
    for td in team.by_tag("td"):
        # http://stackoverflow.com/questions/2365411/python-convert-unicode-to-ascii-without-errors
        this_team_container.append(td.content.encode("utf8"))

    # The first cell comes out like '<a href="/teams/ANA/">Anaheim Ducks</a>';
    # split it into two entries: 1. the team acronym and 2. the franchise name.
    team_acronym = this_team_container[0][this_team_container[0].find("/teams") + 7:this_team_container[0].find("/teams") + 10]
    franchise = this_team_container[0].split(">")[1][0:len(this_team_container[0].split(">")[1]) - 3]

    # now remove the <a href> element and replace it with the cleaned acronym and franchise
    this_team_container.pop(0)
    this_team_container.insert(0, team_acronym)
    this_team_container.insert(1, franchise)
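# Worked example of the slicing above: for '<a href="/teams/ANA/">Anaheim Ducks</a>',
# find("/teams") returns 9, so the slice [16:19] yields the acronym "ANA";
# split(">")[1] is 'Anaheim Ducks</a', and dropping its last 3 characters
# leaves the franchise name "Anaheim Ducks".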
# Creating the csv output file for writing into as well as defining the writer
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
output = open("output/information_management_systems_requirements_" + timestamp + ".csv", "wb")
writer = UnicodeWriter(output)

# Get the DOM object.
url = URL("http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_2%2C9&SEARCH_TERM=both")
dom = DOM(url.download(cached=True))

# add 1st header row
writer.writerow(["Term", "CourseNumber", "Title", "Instructor", "Day", "Time",
                 "Location", "CourseType", "EnrollLimit", "Attributes"])
date_to_write = []

# get main content containing all the courses
main_content = dom.by_class("csearchresults")

# get all the rows that have the course data
all_data_rows = main_content[0].by_tag("tr")

# loop through each row
for ind_data_row in all_data_rows:
    if ind_data_row.attributes.get("class", "") == "" or ind_data_row.attributes.get("class", "") == "odd":
        all_columns = ind_data_row.by_tag("td")
        # ensure course is not cancelled
        if len(all_columns) > 1 and plaintext(str(all_columns[4])).find("Canceled") == -1:
            term = ""
# Ratings
# Number of Ratings

page_urls = []
tableRows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in tableRows[1:]:
    a = tr.by_tag('a')[0]
    page_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))

for p in page_urls:
    p_url = URL(p)
    p_dom = DOM(p_url.download(cached=True))

    title = clean_unicode(p_dom.by_class('header')[0].content)
    title = plaintext(strip_between('<span', '</span>', title))

    runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)

    genres = []
    for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
        genres.append(clean_unicode(genre.content))

    directors = []
    writers = []
    actors = []

    text_blocks = p_dom.by_class('txt-block')[:3]
    for t in text_blocks:
        spans = t.by_tag('span')
presto_schools = ["Harvard", "Yale", "Brown"]

# download presto sports data
# for each school
for school in range(0, 3):
    # for each year
    for presto_year in presto_years:
        print presto_schools[school] + " " + presto_year

        # generate url string, download dom
        base_url = presto_urls[school]
        url_string = base_url + presto_year + "/roster"
        url = URL(url_string)
        dom = DOM(url.download(cached=True))
        print "Downloaded."
        print "-----------------------"

        rows = (dom.by_class("roster-row0") + dom.by_class("roster-row1"))

        # go through rows of swimmers
        for row in rows:
            # adjustment for Brown 2011-12 (has an extra column),
            # and for Brown and Harvard in general because year columns are in a different place
            adj = 0
            if school == 2 and presto_year == "2011-12":
                adj = 1
            elif school == 0 or school == 2:
                adj = -1

            cells = row.by_tag("td")
            # skip divers
            if cells[1 - adj].content.strip() == "Diving":
                continue

            year_name = cells[2 + adj].content.strip()
            grad_year = 0
# To get you started, uncomment the following print line and see the output for the first entry
#print dom.by_class("title")[0].by_tag("a")[0].content
# by_class selects all with class="title" and returns a list. Familiarize yourself with the DOM
# by trying out different combinations. See what each returns.
# NOTE: if you see u' in front of your strings, you can use encode('ascii', 'ignore') on your string.
# To learn why, you can optionally read up on http://docs.python.org/2/howto/unicode.html

# You could start with this

# add header row
writer.writerow(["Title", "Ranking", "Genre", "Actors", "Runtime"])

allElements = dom.by_class("title")
for i, e in enumerate(allElements):
    # WRITE YOUR CODE HERE
    for title in e.by_tag('a')[:1]:
        title = plaintext(title.content.encode('ascii', 'ignore'))
    for rank in e.by_class('value')[:1]:
        rank = plaintext(rank.content.encode('ascii', 'ignore'))
    for genre in e.by_class('genre')[:1]:
        genre = re.sub(" \| ", ", ", plaintext(genre.content.encode('ascii', 'ignore')))
    for actors in e.by_class('credit')[:1]:
        actors = re.sub("^With: ", "", plaintext(actors.content.encode('ascii', 'ignore')))
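    # A hedged continuation of the loop above (assuming, as in the other IMDB
    # snippets in this collection, that runtimes live in separate "runtime"
    # spans that parallel the title list):
    runtimes = dom.by_class('runtime')
    runtime = plaintext(runtimes[i].content.encode('ascii', 'ignore')) if i < len(runtimes) else ''
    writer.writerow([title, rank, genre, actors, runtime])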