Example #1
    def save_to_db(self, item):
        url = item['url']
        data = item['data']

        # Only persist items parsed as full articles with a reasonably long body.
        if data['type'] != 'article':
            return item
        text = data.get('text')

        if not text or len(text) < 140:
            return item

        try:
            article = Article.objects.get(source_url=url)
            gen_article_data(article, data)
            try:
                article.save()
            except (ArticleExsit, NotUniqueError):
                # This URL is already stored; skip the duplicate.
                info = 'duplicated: %s' % article.source_url
                logging.info(info)
                return
            # Mirror any remote images locally before publishing.
            if article.top_images:
                article.top_images = download_images(
                    [image['url'] for image in article.top_images], article.seq_id)
            if article.related_images:
                article.related_images = download_images(
                    [image['url'] for image in article.related_images], article.seq_id)
            # Extract content features, mark the article usable and warm the cache.
            article.attach_feature(ArticleParser(article.text).gen_content_feature())
            article.set_usable()
            article.save()
            article.warm(warm_conn)
        except Exception:
            # Report any unexpected failure to Sentry.
            raven_client.captureException()
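
From the guards at the top of `save_to_db`, the pipeline item appears to be a dict with a `url` and a parsed `data` payload. A minimal sketch of that shape, with placeholder values that are illustrative and not taken from the project:

# Hypothetical item accepted by save_to_db; values are placeholders.
item = {
    'url': 'https://example.com/some-story',
    'data': {
        'type': 'article',   # anything else is returned unchanged
        'text': 'body text of at least 140 characters ...',
        'title': 'Some story',
        'pubdate': 'Mon, 01 Jan 2018 12:00:00 GMT',
        'content': '<p>rendered HTML</p>',
    },
}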
Example #2
    def _process_article(self, source, rsp):
        try:
            if rsp is None:
                return
            # A falsy response signals a failed download (error status).
            if not rsp:
                info = 'download error: %s' % rsp.request.url
                logging.error(info)
                raise Exception(info)
            if not rsp.content:
                info = 'rsp no content: %s' % rsp.request.url
                logging.error(info)
                raise Exception(info)
            data = parse_data(rsp.content)
            if not data.get('content'):
                return
            data['origin_url'] = rsp.url
            article = Article(source=source, category=source, rss_url=rsp.request.url)
            gen_article_data(source, article, data)
            try:
                article.save()
            except (ArticleExsit, NotUniqueError):
                # This URL is already stored; skip the duplicate.
                info = 'duplicated: %s' % article.source_url
                logging.info(info)
                return
            # Mirror any remote images locally before publishing.
            if article.top_images:
                article.top_images = download_images(
                    [image['url'] for image in article.top_images], article.seq_id)
            if article.related_images:
                article.related_images = download_images(
                    [image['url'] for image in article.related_images], article.seq_id)
            # Extract content features, mark the article usable and warm the cache.
            article.attach_feature(ArticleParser(article.text).gen_content_feature())
            article.set_usable()
            article.save()
            article.warm(warm_conn)
        except Exception:
            # Report any unexpected failure to Sentry.
            raven_client.captureException()
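
The `if not rsp` check only makes sense if the response's truth value reflects the download result, as a `requests.Response` does (`bool(rsp)` is `rsp.ok`, i.e. `False` for HTTP 4xx/5xx). A standalone sketch of the same guards under that assumption; the snippet itself does not confirm which HTTP client is used:

import requests

rsp = requests.get('https://example.com/feed.xml', timeout=10)
# bool(rsp) is rsp.ok, so "not rsp" means the server answered with an error status.
if not rsp:
    print('download error: %s' % rsp.request.url)
elif not rsp.content:
    print('rsp no content: %s' % rsp.request.url)
else:
    print('downloaded %d bytes' % len(rsp.content))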
Example #3
File: utils.py  Project: ICCV/chaos
def gen_article_data(article, data):
    score = data.get('score', 0)
    article.hot = 500 if 'Top Stories' in article.category else 250
    article.quality = score
    article.source_url = normalize_url(article.source_url)
    article.title = data.get('title')
    pubdate = data.get('pubdate')
    article.pubdate = pubdate
    now = datetime.datetime.utcnow()

    # Fall back to "now" when the publication date is missing or unparsable,
    # and never let an article claim a timestamp in the future.
    try:
        if not pubdate:
            article.published_at = now
        else:
            published_at = timestr2utc(pubdate)
            published_at = published_at.replace(tzinfo=None)
            if published_at > now:
                # Likely clock skew or timezone drift: pull future dates back by four hours.
                delta = datetime.timedelta(hours=4)
                article.published_at = published_at - delta
            else:
                article.published_at = published_at
    except Exception:
        from crawler.settings import raven_client
        raven_client.captureException()
        article.published_at = now

    article.description = data.get('description')
    keywords = data.get('keywords')
    article.keywords = keywords.split(', ') if keywords else None
    article.related_images = data.get('image_detail')
    image = data.get('image')
    article.top_images = [{'url': image}] if image else None
    article.text = data.get('text')
    article.content = data.get('content')
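
`timestr2utc` and `normalize_url` are not shown in these snippets. The following is a minimal sketch of what they plausibly do, assuming python-dateutil for date parsing; the project's actual implementations may differ, for example in how aggressively query strings are stripped:

# Hypothetical stand-ins for the helpers used above; not the project's real code.
import datetime
from urllib.parse import urlsplit, urlunsplit

from dateutil import parser as dateparser  # assumption: python-dateutil is available


def timestr2utc(pubdate):
    # Parse an RSS/HTTP date string; convert aware datetimes to UTC.
    dt = dateparser.parse(pubdate)
    if dt.tzinfo is not None:
        dt = dt.astimezone(datetime.timezone.utc)
    return dt  # gen_article_data strips tzinfo afterwards


def normalize_url(url):
    # Canonicalize the source URL so duplicates collapse to one source_url.
    scheme, netloc, path, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc.lower(), path, '', ''))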