import logging
import re

import requests

import common_lib
# NOTE: assumed import path for the item classes; adjust to the project layout.
from items import BuddyItem, ExerciseDiary, FoodDiary, PostItem


def parse_buddy(response):
    """Extract a member's buddy list from a profile page."""
    item = BuddyItem()
    item['id'] = common_lib.get_page_id(response)
    item['user_id'] = common_lib.get_user_id(response)
    buddies = response.xpath('//b/a[@class="member"]')
    item['buddies'] = [b.xpath('normalize-space(text())').extract()
                       for b in buddies]
    return item
def parse_food_diary(response):
    """Extract the daily nutrition summary and the individual dishes
    from a food diary page."""
    item = FoodDiary()
    item['id'] = common_lib.get_page_id(response)
    item['user_id'] = common_lib.get_user_id(response)
    item['link'] = response.url
    item['date'] = response.xpath(
        'normalize-space(//div[@class="subtitle"]/text())').extract()
    # day totals: fat / carbs / protein / calories, in that order
    general_info = response.xpath(
        '//table[@class="foodsNutritionTbl"]//td[@class="sub"]/text()'
    ).extract()
    if len(general_info) < 4:
        logging.warning("Could not find general food info on page %s",
                        response.url)
    else:
        item['food'] = {
            'fat': general_info[0],
            'carbs': general_info[1],
            'prot': general_info[2],
            'cals': general_info[3]
        }
    dishes_xpath = response.xpath(
        '//table[@class="generic foodsNutritionTbl"]//tr[@valign="top"]')
    dishes = []
    for d in dishes_xpath:
        dish_name = d.xpath('.//b/text()').extract()
        dish_info = d.xpath('.//td[@class="normal"]')
        if len(dish_info) < 4:
            logging.warning("Could not find info for food %s @ url %s",
                            dish_name[0], response.url)
            continue
        dish = {
            'name': dish_name,
            'fat': dish_info[0].xpath('text()').extract(),
            'carbs': dish_info[1].xpath('text()').extract(),
            'prot': dish_info[2].xpath('text()').extract(),
            'cals': dish_info[3].xpath('text()').extract()
        }
        dishes.append(dish)
    item['dishes'] = dishes
    item['rdi'] = response.xpath(
        'normalize-space(//div[@class="big"]/text())').extract()
    return item
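# The parsers above return nutrition values as raw strings (e.g. "12.5g").
# A minimal post-processing sketch, assuming the site formats each value as
# a number followed by an optional unit suffix; this helper is not part of
# the original pipeline, just an illustration of how the strings could be
# normalized downstream.
def to_float(value):
    """Parse the leading number out of a value such as '12.5g' or '1,050 kcal'."""
    match = re.search(r'[-+]?\d+(?:,\d{3})*(?:\.\d+)?', value)
    return float(match.group(0).replace(',', '')) if match else None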
def parse_exercise_diary(response):
    """Extract the daily exercise summary and the individual exercises
    from an exercise diary page."""
    item = ExerciseDiary()
    item['id'] = common_lib.get_page_id(response)
    item['user_id'] = common_lib.get_user_id(response)
    item['link'] = response.url
    item['date'] = response.xpath(
        'normalize-space(//div[@class="subtitle"]/text())').extract()
    summary_data = response.xpath(
        '//table[@class="generic activityValuesTbl"]'
        '//td[@class="sub"]/text()').extract()
    if len(summary_data) < 2:
        logging.warning("Exercise summary data not found for page %s.",
                        response.url)
    else:
        item['summary'] = {
            'time_spent': summary_data[0],
            'calc': summary_data[1]
        }
    exercises_xpath = response.xpath('//tr[starts-with(@id, "infsec")]')
    exercises = []
    for e in exercises_xpath:
        # The markup for the time-spent cell varies (note the leading space
        # in the first class attribute, which matches the page verbatim);
        # try each known variant and keep the first non-empty result.
        time_spent_1 = e.xpath(
            './/div[@class=" activityCell bTop"]/text()').extract()
        time_spent_2 = e.xpath(
            './/div[@class="activityCell bLeft bTop"]/a/b/text()').extract()
        time_spent_3 = e.xpath(
            './/div[@class="activityCell bTop"]/text()').extract()
        time_spent = time_spent_1 or time_spent_2 or time_spent_3
        exercise = {
            'name': e.xpath('.//b/text()').extract()[0],
            'time_spent': time_spent[0],
            'cals': e.xpath('.//div[@class="activityCell bTop bRight"]'
                            '/text()').extract()[0]
        }
        exercises.append(exercise)
    item['exercises'] = exercises
    return item
def parse_post(response):
    """Extract a journal post: text, weight/diet status, comments, the
    day's food and exercise calendar entries, and likes."""
    item = PostItem()
    item['id'] = common_lib.get_page_id(response)
    item['link'] = response.url
    item['user_id'] = common_lib.get_user_id(response)
    item['date'] = response.xpath(
        '//div[@class="breadcrumb_noLink"]/text()').extract()

    # textual fields
    content = response.xpath('//table[@class="generic breakout"]')
    item['text'] = content.xpath(
        'normalize-space(tr/td/div[2]/text())').extract()
    item['weight'] = {
        'current': content.xpath(
            'tr/td/div[3]/table/tr/td[2]/span[1]/text()').extract(),
        'lost_sofar': content.xpath(
            'tr/td/div[3]/table/tr/td[2]/span[2]/b/text()').extract()
    }
    diet_status = content.xpath(
        'normalize-space(tr/td/div[3]/table/tr/td[2]/text()[4])').extract()
    item['diet'] = {
        'status': diet_status[0].strip() if diet_status else None,
        'name': content.xpath(
            'normalize-space(tr/td/div[3]'
            '//div[@class="smallText"][2]/a/text())').extract()
    }

    # comments
    comments_xpath = response.xpath('//tr[@class="listrow"]/td')
    comments = []
    for comment_xpath in comments_xpath:
        comment = {
            'user_id': comment_xpath.xpath('div[2]/a/text()').extract(),
            # the date is the first three whitespace-separated tokens
            'date': " ".join(comment_xpath.xpath(
                'normalize-space(div[2]/text())').extract()[0].split()[:3]),
            'text': comment_xpath.xpath(
                'normalize-space(div[1]/text())').extract()[0]
        }
        comments.append(comment)
    item['comments'] = comments

    # calendar entry
    kcal = response.xpath(
        '//table[@class="generic"][3]//a[1]/text()').extract()
    food_status = response.xpath(
        'normalize-space(//table[@class="generic"]'
        '/tr/td[@class="smallText"][2]/text())').extract()
    food_info = None
    if food_status and food_status[0]:
        food_info = re.search(r"Fat: (\d+\.\d+\S+) \| "
                              r"Prot: (\d+\.\d+\S+) \| "
                              r"Carb: (\d+\.\d+\S+)\.",
                              food_status[0])
    food_text = response.xpath(
        'normalize-space(//table[@class="generic breakout"]'
        '//tr[@valign="top"][1])').extract()
    item['food'] = {
        'calories': kcal[0] if kcal else None,
        'fat': food_info.group(1) if food_info else None,
        'prot': food_info.group(2) if food_info else None,
        'carb': food_info.group(3) if food_info else None,
        # u'\xa0' is a non-breaking space
        'text': food_text[0].replace(u'\xa0', " ") if food_text else None
    }
    exercise_text = response.xpath(
        'normalize-space(//table[@class="generic breakout"]'
        '//tr[@valign="top"][2])').extract()
    item['exercise'] = {
        'calories': response.xpath(
            '//table[@class="generic"][4]/tr/td/a/text()').extract(),
        'text': (exercise_text[0].replace(u'\xa0', " ")
                 if exercise_text else None)
    }

    # likes are served by a separate AJAX endpoint, fetched out of band
    base_url = 'http://www.fatsecret.com/ajax/FeedSupporters.aspx'
    params = {'id': item['id'], 'tid': '2'}
    r = requests.get(base_url, params=params)
    item['likes'] = re.findall(r'>(\S+)</a>', r.text)

    # TODO include html
    # item['html'] = response.body
    return item
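# A quick sanity check of the nutrition-summary regex in parse_post, run
# against a made-up status line (the numbers are illustrative, not real
# scraped data):
#
#   >>> m = re.search(r"Fat: (\d+\.\d+\S+) \| Prot: (\d+\.\d+\S+) \| "
#   ...               r"Carb: (\d+\.\d+\S+)\.",
#   ...               "Fat: 10.50g | Prot: 20.00g | Carb: 30.25g.")
#   >>> m.groups()
#   ('10.50g', '20.00g', '30.25g')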
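# A minimal sketch of how these parsers might be wired into a Scrapy
# spider. The spider name, start URL, and the link-extraction XPath below
# are hypothetical placeholders, not the project's actual configuration.
import scrapy


class FatSecretSpider(scrapy.Spider):
    name = 'fatsecret'                          # hypothetical spider name
    start_urls = ['http://www.fatsecret.com/']  # hypothetical entry point

    def parse(self, response):
        # Follow links to individual journal posts (hypothetical XPath)
        # and hand each response to the module-level post parser.
        for href in response.xpath('//a[@class="journal"]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=parse_post)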