def parse_project_description(self, root):
    """Return the project description as plain text.

    A newline is appended to the tail of every element matched by ``//br``
    so that line breaks survive tag stripping. The tree is then rendered,
    entity-decoded and stripped of tags (without whitespace normalisation).
    Finally the text is cut at the first occurrence of ``Posted On`` and
    then at the first occurrence of ``Budget :``; whatever precedes the
    marker is kept and stripped of surrounding whitespace.

    Note: ``root`` is modified in place (the ``tail`` of the matched
    elements changes).
    """
    for br in root.xpath("//br"):
        br.tail = "\n" if not br.tail else br.tail + "\n"

    markup = render_html(root, encoding="unicode")
    description = strip_tags(decode_entities(markup), normalize_space=False)

    # Keep only what comes before each marker, applied in this order.
    for marker in (u"Posted On", u"Budget :"):
        description = description.split(marker)[0].strip()
    return description
def parse_project_description(self, root):
    """Return the project description as plain text.

    Every element matched by ``//br`` gets a newline appended to its tail
    (``root`` is modified in place), then the tree is rendered,
    entity-decoded and stripped of tags without whitespace normalisation.
    Only the text before the first ``Category:`` marker is returned, with
    surrounding whitespace removed.
    """
    line_breaks = root.xpath('//br')
    for line_break in line_breaks:
        existing_tail = line_break.tail or ''
        line_break.tail = existing_tail + '\n'

    rendered = render_html(root, encoding='unicode')
    decoded = decode_entities(rendered)
    plain = strip_tags(decoded, normalize_space=False)

    # Everything from the marker onwards is discarded.
    before_marker = plain.partition(u'Category:')[0]
    return before_marker.strip()
def parse_entry(entry, feed, teaser_size):
    """Convert one feed entry into a dict of its details.

    :param entry: feed entry exposing ``link`` and ``title`` attributes and
        a dict-style ``get`` method.
    :param feed: not used by this function; kept in the signature so that
        existing callers keep working.
    :param teaser_size: forwarded to ``build_entry_content`` when building
        the teaser.
    :returns: dict with the keys ``url``, ``title``, ``content``,
        ``teaser``, ``date``, ``tags`` and ``guid``.
    :raises ValueError: if ``parse_entry_date`` yields a falsy value for
        the entry. ``ValueError`` is a subclass of ``Exception``, so callers
        that catch ``Exception`` are unaffected.
    """
    details = {
        'url': entry.link,
        'title': strip_tags(entry.title),
        'content': build_entry_content(entry),
        'teaser': build_entry_content(entry, teaser=True,
                                      teaser_size=teaser_size),
        'date': parse_entry_date(entry),
        'tags': parse_entry_tags(entry),
    }

    # The guid is the SHA-1 hex digest of the entry id, falling back to the
    # link when the id is missing or empty. The token is encoded to UTF-8
    # before hashing.
    guid_token = (entry.get('id') or entry.link).encode('utf-8')
    details['guid'] = sha1(guid_token).hexdigest()

    if not details['date']:
        raise ValueError(
            'Entry %s does not have a publication date' % entry.link)
    return details
def task_initial(self, grab, task):
    """Store the offers of every store listed on the circular page.

    For each element matched by ``//div[@id="circular-stores"]/div`` the
    store number is taken from the last ``-``-separated piece of the
    element's ``class`` attribute and substituted into ``task.rss_url``.
    The resulting feed is parsed with ``feedparser`` and every item is
    saved as one ``Data`` row. Items that carry a ``vertis_itemlargeimage``
    link additionally get a ``save_image`` task queued.

    Missing item fields are stored as empty strings. A date that cannot be
    parsed is stored as the raw feed value.

    Relies on ``IMAGE_DIR``, ``table_name``, ``session``, ``Data``,
    ``Task``, ``feedparser`` and ``html`` from the enclosing module.

    :param grab: object exposing ``xpath_list`` for the fetched page.
    :param task: object exposing the ``rss_url`` format template.
    """
    import logging

    logger = logging.getLogger(__name__)
    date_format = '%a, %d %B %Y %H:%M:%S'

    def format_feed_date(raw):
        """Return ``raw`` as ``dd/mm/YYYY``; return ``raw`` unchanged if
        it cannot be parsed."""
        try:
            # The last space-separated token is dropped before parsing
            # (presumably a timezone label -- TODO confirm against the feed).
            trimmed = ' '.join(raw.split(' ')[:-1])
            return datetime.strptime(trimmed, date_format).strftime('%d/%m/%Y')
        except (AttributeError, TypeError, ValueError):
            return raw

    for place in grab.xpath_list('//div[@id="circular-stores"]/div'):
        brand = place[0].text_content()
        store_number = place.attrib['class'].split('-')[-1]
        feed = feedparser.parse(task.rss_url.format(store_number))

        for item in feed['items']:
            product = item.get('title', '')
            price = item.get('vertis_price', '')
            saving = item.get('vertis_moreprice', '')
            valid_from = format_feed_date(item.get('vertis_psdate', ''))
            valid_to = format_feed_date(item.get('vertis_edate', ''))

            description = ''
            raw_description = item.get('description')
            if raw_description:
                try:
                    description = html.strip_tags(raw_description)
                except Exception:
                    # Best effort: a bad description must not lose the row.
                    logger.exception(
                        'Could not strip tags from the description of %r',
                        product)

            image = ''
            image_link = item.get('vertis_itemlargeimage')
            if image_link:
                try:
                    # Hash the UTF-8 bytes of the link: hashing unencoded
                    # text fails on Python 3 and on non-ASCII links.
                    digest = sha1(image_link.encode('utf-8')).hexdigest()
                    image_path = os.path.join(
                        IMAGE_DIR, table_name, brand, digest + '.jpg')
                    self.add_task(Task(name='save_image', url=image_link,
                                       image_name=image_path))
                except Exception:
                    # Best effort: the row is still stored without an image.
                    logger.exception(
                        'Could not queue image download for %r', image_link)
                else:
                    # Record the path only when the download was queued.
                    image = image_path

            data = Data(store_number.encode('utf-8'),
                        product.encode('utf-8'),
                        description.encode('utf-8'),
                        price.encode('utf-8'),
                        saving.encode('utf-8'),
                        valid_from,
                        valid_to,
                        image)
            session.add(data)
            # Each row is committed as soon as it is added.
            session.commit()