def _set_last_ts(self, date): date_str = (utils.convert_date_to_str(date) if isinstance(date, datetime) else date) if not date_str: return with open(self.last_seen_filename, 'wb') as f: f.write(date_str)
def test_last_ts_prop(self):
    """``last_ts`` accepts datetime and str, always yields a datetime."""
    # a freshly created spider has no timestamp yet
    self.assertIsNone(self.spider.last_ts)
    now = datetime.datetime.utcnow()
    # setting either representation must round-trip to a datetime object
    for value in (now, utils.convert_date_to_str(now)):
        self.spider.last_ts = value
        self.assertEqual(datetime.datetime, type(self.spider.last_ts))
    # after assignment the timestamp is present and persisted on disk
    self.assertIsNotNone(self.spider.last_ts)
    self.assertTrue(os.path.exists(self.spider.last_seen_filename))
def _process_comments(self, response):
    """Parse VK board-comments JSON into ``PostItem`` objects.

    BUG FIX: the original signature was ``(response)`` with no ``self``,
    yet the body reads ``self.owner_id`` — calling it as a method would
    fail. The missing ``self`` parameter is restored here.

    :param response: HTTP response whose body is the VK API JSON payload.
    :returns: generator of ``postscraper.items.PostItem``.
    """
    data = json.loads(response.body)
    # skip the first element — presumably the total count in the
    # old-style VK API list response (TODO confirm against API version)
    posts_data = data["response"]["comments"][1:]
    for post in posts_data:
        item = postscraper.items.PostItem()
        item['date'] = utils.convert_date_to_str(
            datetime.fromtimestamp(post['date']))
        item['text'] = post['text']
        item['title'] = ("Board post from %s" % item['date'])
        # owner_id is negative for groups, so abs() yields the public id
        item['link'] = ("http://vk.com/public%(group)s?w=wall-%(id)s"
                        % {'group': abs(self.owner_id),
                           'id': "%s_%s" % (abs(self.owner_id),
                                            post['id'])})
        # positive from_id is a user ("id..."), negative is a community
        item['author'] = ("http://vk.com/" +
                          ('id%s' % post['from_id']
                           if post['from_id'] > 0
                           else 'club%s' % abs(post['from_id'])))
        yield item
def close_spider(self, spider):
    """Sends an email with new items if any.

    Renders the filtered items through the ``mail_items.html`` template
    and stores the resulting HTML on ``spider.email``; sets it to None
    when there is nothing to report.
    """
    items = self._filter_by_query(spider)
    # don't generate an email with 0 results — idiomatic emptiness
    # check instead of len(items) == 0
    if not items:
        spider.email = None
        return
    env = Environment(loader=FileSystemLoader(settings.TEMPLATES_DIR))
    template = env.get_template('mail_items.html')
    body = template.render(items=items, query=settings.QUERY)
    # build the email header line; the text is kept on the spider object
    # (not written to a file here, despite the original comment)
    date = ("the very beginning" if not spider.last_ts
            else utils.convert_date_to_str(spider.last_ts))
    text = ("<h1>"
            "%(count)s new items from %(link)s since %(date)s</h1>\n"
            "%(body)s" % {'count': len(items),
                          'link': spider.name,
                          'date': date,
                          'body': body})
    spider.email = text
def _parse_vk_wall(self, response):
    """Deals with wall posts' json data received from VK API"""
    if response.status != 200:
        LOG.info("200 OK expected, got %s" % response.status)
        raise exc.SpiderException("Response code not supported: %s"
                                  % response.status)
    data = json.loads(response.body)
    # FIXME code duplication
    if "error" in data:
        raise exc.SpiderException("%(name)s spider failed: %(reason)s"
                                  % {"reason": data["error"]["error_msg"],
                                     "name": self.name})
    # owner_id is negative for groups; hoist the absolute value once
    group_id = abs(self.owner_id)
    # first element of the response list is skipped (presumably a count
    # in the old-style VK API — TODO confirm)
    for entry in data["response"][1:]:
        wall_item = postscraper.items.PostItem()
        if entry['text'] == '':
            # a repost of some kind
            try:
                link_info = entry['attachment']['link']
                wall_item['text'] = ("%(title)s\n%(description)s" % {
                    'description': link_info['description'],
                    'title': link_info['title']})
                wall_item['link'] = link_info['url']
            except (KeyError, ValueError):
                # attachment without a usable link — drop the post
                continue
        else:
            # a native post
            wall_item['text'] = entry['text']
            wall_item['link'] = ("http://vk.com/public%(group)s?w=wall-%(id)s"
                                 % {'group': group_id,
                                    'id': "%s_%s" % (group_id,
                                                     entry['id'])})
        wall_item['date'] = utils.convert_date_to_str(
            datetime.fromtimestamp(entry['date']))
        wall_item['title'] = ("Wall post from %s" % wall_item['date'])
        # positive from_id is a user page, negative is a community page
        author_id = entry['from_id']
        if author_id > 0:
            wall_item['author'] = "http://vk.com/" + 'id%s' % author_id
        else:
            wall_item['author'] = "http://vk.com/" + 'club%s' % abs(author_id)
        yield wall_item