type = re.sub('\n|\d+.*|\(.*\)','', g.content.encode('ascii', 'ignore').strip('\r\n')) if ((type != ' \n') and not (re.match('^\s+', type))): genre.append(type) genresStr = ';'.join(genre) #======================================================================= # Get the directors #======================================================================= directors = [] for movie in movieDom.by_attribute(itemprop="director"): # Get rid of the html tags dir = re.sub('<[a-zA-Z\/][^>]*>','', movie.content.encode('ascii','ignore').lstrip('\r\n')) # Get rid of new line dirs = re.sub('\n', '', dir) # Directors for other movies have leading spaces - don't add them if not re.match('^\s+', dirs): directors.append(dirs) directorsStr = ';'.join(directors) #=======================================================================
def _cook_span_text(pdom, span_id):
    """Return the content of the element with id *span_id*, or "0" if unavailable.

    ``<em>`` / ``</em>`` tags in the content are replaced by single spaces so
    the leading number can later be isolated with ``str.split()``.
    """
    try:
        text = pdom.by_attribute(id=span_id)[0].content
        return text.replace("<em>", " ").replace("</em>", " ")
    except Exception:
        # Best effort: a recipe without this span counts as zero.
        return "0"


def get_info(baseurl, out_filename, npages=200):
    """Scrape recipe listing pages and write one row per recipe to a file.

    For every page number from 1 up to (but not including) *npages*, the
    listing at ``baseurl + "?Page=<n>"`` is downloaded, each element of class
    ``rectitlediv`` is turned into a recipe slug, and the recipe's detail page
    is downloaded and parsed.

    Args:
        baseurl: URL of the listing; ``?Page=<n>`` is appended to it.
        out_filename: Path of the output file (opened for writing, truncated).
        npages: Exclusive upper bound of the page numbers to visit.

    Each data row is ``[title, rating, nuts, mins, ings, full_ings]`` where
    ``nuts`` is the nutrient values joined by tabs. Note that the header row
    lists 11 column names while each data row carries 6 values (see TODO).

    If anything fails while processing a page, the remainder of that page is
    skipped (rows already written for it stay) and scraping continues with the
    next page. ``base`` and ``end`` used to build the recipe URL are defined
    outside this function.
    """
    import logging

    # ``with`` guarantees the file is closed even if writing the header or
    # constructing the writer raises.
    with open(out_filename, "w") as output:
        w = writer.UnicodeWriter(output)
        # TODO: fix this header
        w.writerow(
            [
                "Title",
                "Rating",
                "Calories (kcal)",
                "Cholesterol (mg)",
                "Fat (g)",
                "Protein (g)",
                "Fiber (g)",
                "Sodium (mg)",
                "Cook Time",
                "Ingredients",
                "Full Ingredients",
            ]
        )
        for page in range(1, npages):
            try:
                url = URL(baseurl + "?Page=%d" % page)
                dom = DOM(url.download(cached=True))
                # one entry per recipe listed on this page
                for link in dom.by_class("rectitlediv"):
                    # recipe slug: the text between "/recipe/" and "/detail"
                    title = link.content.split("/recipe/")[1].split("/detail")[0]

                    # download individual recipe
                    rpage = URL(os.path.join(base, title, end))
                    pdom = DOM(rpage.download(cached=True))

                    # average rating value: fourth piece of the element's
                    # source when split on double quotes
                    rating = pdom.by_attribute(itemprop="ratingValue")[0].source.split('"')[3]

                    # list of nutrition elements
                    nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                    nut_vals = [
                        nut.by_attribute(id="lblNutrientValue")[0].content
                        for nut in nut_list
                    ]
                    nuts = "\t".join(nut_vals)

                    # time needed to cook, converted to total minutes
                    cook_hours = _cook_span_text(pdom, "cookHoursSpan")
                    cook_mins = _cook_span_text(pdom, "cookMinsSpan")
                    mins = str(int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0]))

                    # ingredients: each block holds both the name and the amount
                    ing_units = []
                    ing_vals = []
                    for ing in pdom.by_attribute(itemprop="ingredients"):
                        tmp_ing = ing.by_id("lblIngName").content
                        # NOTE(review): this literal appears as a single space
                        # in the source under review; it may originally have
                        # been a non-breaking space or "&nbsp;" - confirm.
                        if " " in ing.content:
                            continue
                        try:
                            tmp_amount = ing.by_id("lblIngAmount").content
                        except Exception:
                            tmp_amount = ""  # LET THIS BE THE EMPTY CHAR we decide on
                        # Appended together, so both lists always have equal length.
                        ing_units.append(tmp_amount)
                        ing_vals.append(tmp_ing)

                    ings = ";".join(ing_vals)
                    ing_units = [x + "|" for x in ing_units]
                    # Build "amount| name" strings from the repr of each
                    # (amount, name) tuple, then strip the repr punctuation.
                    str_ings = [str(x) for x in zip(ing_units, ing_vals)]
                    str_ings = [x.replace(",", " ") for x in str_ings]
                    full_ings = ";".join(str_ings)
                    # NOTE(review): the final replace appears as
                    # single-space -> single-space (a no-op) in the source
                    # under review; it may originally have collapsed a
                    # double space - confirm before changing.
                    full_ings = (
                        full_ings.replace("u'", "")
                        .replace("'", "")
                        .replace(", u", "")
                        .replace("(", "")
                        .replace(")", "")
                        .replace(" ", " ")
                    )

                    w.writerow([title, rating, nuts, mins, ings, full_ings])
            except Exception as err:
                # Keep the best-effort behaviour (move on to the next page) but
                # no longer swallow KeyboardInterrupt/SystemExit, and leave a
                # trace of what was skipped.
                logging.warning(
                    "get_info: skipping remainder of page %d: %r", page, err
                )
def _ascii_bytes(text):
    """Encode *text* as ASCII, substituting '?' for characters that do not fit.

    Byte strings are decoded leniently first so that non-ASCII input never
    raises; wrapping unicode text in ``str()`` before encoding (as the code
    previously did) raises UnicodeEncodeError on Python 2 for such input.
    """
    if isinstance(text, bytes):
        text = text.decode("ascii", "replace")
    return text.encode("ascii", "replace")


def get_title_attributes(title, titleLink):
    """Download a title page and collect its attributes into a ``Title``.

    Args:
        title: Title name; encoded to ASCII (with replacement) for ``Title``.
        titleLink: URL of the page to download and parse.

    Returns:
        The ``Title`` object, populated through its ``addDirectors``,
        ``addWriters``, ``addActors``, ``addRating``, ``addRunTime`` and
        ``addGenre`` methods with whatever could be extracted.

    Extraction is best effort: an entry that cannot be parsed is skipped and
    the remaining entries are still processed. Progress is printed to stdout.
    """
    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii', 'replace'))
    print("Movie:  %s" % title)

    # Get Directors
    print("-> About to print directors... ")
    director_blocks = dom.by_attribute(itemprop="director")
    # Guard against pages with no director element instead of raising
    # IndexError, matching the best-effort handling of the other sections.
    if director_blocks:
        for director in director_blocks[0].by_tag("a"):
            print(director.content)
            dirName = unicodedata.normalize('NFD', director.content).encode('ascii', 'replace')
            print("Director ===>  %s" % dirName)
            titleObj.addDirectors(dirName)

    # Get writers
    print("-> About to print writers... ")
    for writer_el in dom.by_attribute(itemprop="writer"):
        # Per-entry handling so one malformed entry does not drop the rest.
        try:
            titleObj.addWriters(_ascii_bytes(writer_el[1][1].content))
        except Exception:
            continue

    print("--> About to get actors... ")
    for actor in dom.by_attribute(itemprop="actors"):
        try:
            titleObj.addActors(_ascii_bytes(actor[1][1].content))
        except Exception:
            continue

    print("--> Aboutb to get rating information... ")
    for rating in dom.by_class("star-box-giga-star"):
        try:
            titleObj.addRating(_ascii_bytes(rating.content))
        except Exception:
            continue

    print("--> About to print other stuff... ")
    for item in dom.by_class("infobar"):
        # Run time: first run of digits in the first <time> element.
        try:
            objMatch = re.search(r"(\d+)", item.by_tag("time")[0].content)
            if objMatch:
                titleObj.addRunTime(_ascii_bytes(objMatch.group(1)))
        except Exception:
            pass

        # Genres: anchors whose href contains "genre".
        for genreItem in item.by_tag("a"):
            try:
                if re.search("genre", genreItem.attributes['href']):
                    titleObj.addGenre(_ascii_bytes(genreItem.content))
            except Exception:
                continue

    return titleObj