示例#1
0
 def parse_project_description(self, root):
     """Return the plain-text project description extracted from ``root``.

     A newline is appended to the tail of every ``br`` element matched by
     the ``//br`` XPath so that line breaks remain visible once the markup
     is removed.  The tree is then rendered with ``render_html``, passed
     through ``decode_entities`` and ``strip_tags`` (with
     ``normalize_space=False``), and everything from the first
     "Posted On" marker, then from the first "Budget :" marker, is cut
     off.  The result is whitespace-stripped after each cut.

     NOTE(review): ``root`` is presumably an lxml element; ``//br`` is an
     absolute path, so it likely matches every ``br`` in the document and
     not only those below ``root`` -- confirm against the caller.
     """
     for br in root.xpath("//br"):
         existing_tail = br.tail or ""
         br.tail = existing_tail + "\n"

     markup = render_html(root, encoding="unicode")
     text = strip_tags(decode_entities(markup), normalize_space=False)

     # Trailing page sections start at these markers; keep only what
     # precedes each one, in this order.
     for marker in (u"Posted On", u"Budget :"):
         text = text.split(marker)[0].strip()
     return text
示例#2
0
 def parse_project_description(self, root):
     """Extract the project description from ``root`` as plain text.

     Steps, in order:

     1. Append ``'\\n'`` to the tail of each ``br`` element returned by the
        ``//br`` XPath, so line breaks are kept after tag removal.
     2. Render the tree via ``render_html(..., encoding='unicode')``.
     3. Run the markup through ``decode_entities`` and then ``strip_tags``
        with ``normalize_space=False``.
     4. Keep only the text before the first ``'Category:'`` marker and
        strip surrounding whitespace.

     NOTE(review): ``root`` is presumably an lxml element -- confirm.
     """
     line_breaks = root.xpath('//br')
     for br in line_breaks:
         trailing = br.tail or ''
         br.tail = trailing + '\n'

     markup = render_html(root, encoding='unicode')
     decoded = decode_entities(markup)
     plain = strip_tags(decoded, normalize_space=False)

     # Everything from the category label onwards is not description text.
     head = plain.split(u'Category:')[0]
     return head.strip()
示例#3
0
文件: feed.py 项目: Kuznitsin/grab
def parse_entry(entry, feed, teaser_size):
    """Build a dict describing a single feed entry.

    The returned dict has the keys ``url``, ``title``, ``content``,
    ``teaser``, ``date``, ``tags`` and ``guid``.  ``guid`` is the SHA-1 hex
    digest of the UTF-8 encoded entry id, falling back to the entry link
    when the id is missing or empty.

    ``feed`` is accepted but not used by this function.

    Raises:
        Exception: if no publication date could be determined for the entry.
    """
    details = {}
    details['url'] = entry.link
    details['title'] = strip_tags(entry.title)
    details['content'] = build_entry_content(entry)
    details['teaser'] = build_entry_content(
        entry, teaser=True, teaser_size=teaser_size)
    details['date'] = parse_entry_date(entry)
    details['tags'] = parse_entry_tags(entry)

    # Prefer the entry's own id as the identity token; use the link otherwise.
    identity = entry.get('id')
    if not identity:
        identity = entry.link
    details['guid'] = sha1(identity.encode('utf-8')).hexdigest()

    if details['date']:
        return details
    raise Exception('Entry %s does not has publication date' % entry.link)
示例#4
0
文件: feed.py 项目: sergithon/grab
def parse_entry(entry, feed, teaser_size):
    """Convert a parsed feed entry into a plain dict of its details.

    Keys of the result: ``url``, ``title`` (tags stripped), ``content``,
    ``teaser`` (built with ``teaser_size``), ``date``, ``tags`` and
    ``guid`` (SHA-1 hex digest of the UTF-8 encoded entry id, or of the
    entry link when there is no usable id).

    The ``feed`` argument is not used here.

    Raises:
        Exception: when the entry has no publication date.
    """
    url = entry.link
    title = strip_tags(entry.title)
    content = build_entry_content(entry)
    teaser = build_entry_content(entry, teaser=True, teaser_size=teaser_size)
    published = parse_entry_date(entry)
    tags = parse_entry_tags(entry)

    # The guid is derived from the entry id when present, else from its link.
    guid_source = entry.get('id') or entry.link
    guid = sha1(guid_source.encode('utf-8')).hexdigest()

    if not published:
        raise Exception('Entry %s does not has publication date' % entry.link)

    return {
        'url': url,
        'title': title,
        'content': content,
        'teaser': teaser,
        'date': published,
        'tags': tags,
        'guid': guid,
    }
    def task_initial(self, grab, task):
        """Store the feed items of every store listed on the fetched page.

        For each ``div`` directly under ``div#circular-stores`` the store
        number is taken from the element's ``class`` attribute, the store's
        feed URL is built from ``task.rss_url`` and parsed with feedparser,
        and one ``Data`` row per feed item is added to ``session`` and
        committed.  When an item has a large image, a ``save_image`` task is
        scheduled for it.

        Every per-item field is read inside its own ``try``/``except`` so a
        missing or malformed field leaves that field at its fallback value
        instead of aborting the item.

        NOTE(review): ``session``, ``Data``, ``IMAGE_DIR`` and ``table_name``
        are not defined in this method; presumably module-level names --
        confirm.
        """
        brand = ''
        store_number = ''
        places = grab.xpath_list('//div[@id="circular-stores"]/div')
        for place in places:
            # The text of the first child element is used as the brand name.
            brand = place[0].text_content()
            # NOTE(review): address, city, state, zip and phone are read but
            # not used anywhere in the visible code; ``zip`` also shadows the
            # builtin of the same name.
            address = place.find('div[@class="store-title"]').text_content()
            city = place.find('div/span[@class="store-city"]').text_content()
            state = place.find('div/span[@class="store-state"]').text_content()
            zip = place.find('div/span[@class="store-zipcode"]').text_content()
            phone = place.find('div[@class="store-phone"]').text_content()
            # The store number is the last '-'-separated piece of the class
            # attribute value.
            store_number = place.attrib['class'].split('-')[-1]

            link = task.rss_url.format(store_number)
            feed = feedparser.parse(link)

            for item in feed['items']:
                # Fallback values used when a field cannot be read.
                product = ''
                description = ''
                price = ''
                saving = ''
                valid_from = ''
                valid_to = ''

                try:
                    product = item['title']
                except Exception:
                    pass
                try:
                    description = html.strip_tags(item['description'])
                except Exception:
                    pass
                try:
                    price = item['vertis_price']
                except Exception:
                    pass
                try:
                    saving = item['vertis_moreprice']
                except Exception:
                    pass
                try:
                    valid_from = item['vertis_psdate']
                    # Drop the last space-separated token (presumably a
                    # timezone suffix -- confirm) before parsing, then
                    # reformat as DD/MM/YYYY.  If parsing fails, valid_from
                    # keeps the raw string assigned on the line above.
                    valid_from = datetime.strptime(' '.join(valid_from.split(' ')[:-1]), '%a, %d %B %Y %H:%M:%S')
                    valid_from = valid_from.strftime('%d/%m/%Y')
                except Exception:
                    pass
                try:
                    valid_to = item['vertis_edate']
                    # Same handling as valid_from: the raw string survives a
                    # failed parse.
                    valid_to = datetime.strptime(' '.join(valid_to.split(' ')[:-1]), '%a, %d %B %Y %H:%M:%S')
                    valid_to = valid_to.strftime('%d/%m/%Y')
                except Exception:
                    pass

                image = ''
                try:
                    image_link = item['vertis_itemlargeimage']
                    # Target path: IMAGE_DIR/table_name/brand/<sha1 of the
                    # image URL>.jpg
                    base_name = os.path.join(IMAGE_DIR, table_name, brand, sha1(image_link).hexdigest()+'.jpg')
                    # image = sys.path.join([IMAGE_DIR, brand, base_name])
                    image = base_name
                    # Schedule the download; the image is not fetched here.
                    self.add_task(Task(name='save_image', url=image_link, image_name=image))
                except Exception:
                    pass

                # Text fields are stored UTF-8 encoded; the dates and the
                # image path are passed through as they are.
                data = Data(store_number.encode('utf-8'), product.encode('utf-8'), description.encode('utf-8'), price.encode('utf-8'),
                            saving.encode('utf-8'), valid_from, valid_to, image)

                # One commit per feed item.
                session.add(data)
                session.commit()