Пример #1
0
 def _process_article(self, source, rsp):
     """Turn a downloaded response into a stored, usable ``Article``.

     Steps, in order: validate the response, parse its body, build and
     save an ``Article``, download its images, attach content features,
     mark it usable, save again and warm it.

     :param source: value stored as both ``source`` and ``category`` of
         the new article and forwarded to ``gen_article_data``.
     :param rsp: downloaded response, or ``None`` to do nothing. It must
         expose ``request.url``, ``url`` and ``content``; a falsy
         response is treated as a download error (presumably a
         requests-style response whose truthiness reflects the HTTP
         status -- confirm against the caller).
     :return: always ``None``.

     Ordinary exceptions (including the ones raised here for bad
     responses) are reported through ``raven_client`` and swallowed, so
     one broken article does not abort the caller.
     """
     try:
         if rsp is None:
             return
         if not rsp:
             info = 'download error: %s' % rsp.request.url
             logging.error(info)
             # Raised on purpose so the handler below reports it.
             raise Exception(info)
         if not rsp.content:
             info = 'rsp no content: %s' % rsp.request.url
             logging.error(info)
             raise Exception(info)
         data = parse_data(rsp.content)
         if not data.get('content'):
             # Nothing extractable: skip quietly, this is not an error.
             return
         data['origin_url'] = rsp.url
         article = Article(source=source, category=source, rss_url=rsp.request.url)
         gen_article_data(source, article, data)
         try:
             # First save happens before the image downloads, which use
             # article.seq_id (presumably assigned on save -- confirm).
             article.save()
         except (ArticleExsit, NotUniqueError):
             info = 'duplicated: %s' % article.source_url
             logging.info(info)
             return
         if article.top_images:
             article.top_images = download_images([image['url'] for image in article.top_images], article.seq_id)
         if article.related_images:
             article.related_images = download_images([image['url'] for image in article.related_images], article.seq_id)
         article.attach_feature(ArticleParser(article.text).gen_content_feature())
         article.set_usable()
         article.save()
         article.warm(warm_conn)
     except Exception:
         # Deliberately not a bare ``except``: KeyboardInterrupt and
         # SystemExit must propagate so the process can be stopped.
         raven_client.captureException()
Пример #2
0
    def save_to_db(self, item):
        """Update the stored ``Article`` for ``item`` and make it usable.

        Items whose ``data['type']`` is not ``'article'``, or whose text
        is missing or shorter than 140 characters, are skipped.
        Otherwise the article with ``source_url == item['url']`` is
        loaded, refreshed from the data, saved, given downloaded images
        and content features, marked usable, saved again and warmed.

        :param item: mapping with a ``'url'`` key and a ``'data'``
            mapping that contains ``'type'`` and optionally ``'text'``.
        :return: ``item`` when it is skipped by the filters above;
            ``None`` in every other case (duplicate, success, or a
            captured exception).
        :raises KeyError: if ``'url'``, ``'data'`` or ``data['type']``
            is missing; these lookups happen outside the ``try``.

        NOTE(review): the skip paths return ``item`` but the processing
        paths return ``None`` -- confirm whether callers expect ``item``
        back on every path.
        """
        url = item['url']
        data = item['data']

        if data['type'] != 'article':
            return item
        text = data.get('text')

        # Very short texts are not stored.
        if not text or len(text) < 140:
            return item

        try:
            article = Article.objects.get(source_url=url)
            gen_article_data(article, data)
            try:
                article.save()
            except (ArticleExsit, NotUniqueError):
                info = 'duplicated: %s' % article.source_url
                logging.info(info)
                return
            if article.top_images:
                article.top_images = download_images([image['url'] for image in article.top_images], article.seq_id)
            if article.related_images:
                article.related_images = download_images([image['url'] for image in article.related_images], article.seq_id)
            article.attach_feature(ArticleParser(article.text).gen_content_feature())
            article.set_usable()
            article.save()
            article.warm(warm_conn)
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must propagate so the process can be stopped.
            raven_client.captureException()