def save_to_db(self, item):
    """Persist a crawled item onto its existing Article record.

    Skips the item (returning it unchanged) when it is not an article or
    its text is shorter than 140 characters. Otherwise fills the Article
    from the parsed data, downloads its images, attaches content features,
    marks it usable, saves it, and warms the cache.

    :param item: dict with 'url' and 'data' keys (parsed crawl payload).
    :returns: the item when skipped, otherwise None.
    """
    url = item['url']
    data = item['data']
    # only article-type payloads are persisted
    if data['type'] != 'article':
        return item
    text = data.get('text')
    if not text or len(text) < 140:
        # too short to be a worthwhile article body
        return item
    try:
        article = Article.objects.get(source_url=url)
        gen_article_data(article, data)
        try:
            article.save()
        except (ArticleExsit, NotUniqueError):
            # already stored by an earlier crawl — nothing more to do
            info = 'duplicated: %s' % article.source_url
            logging.info(info)
            return
        if article.top_images:
            article.top_images = download_images(
                [image['url'] for image in article.top_images],
                article.seq_id)
        if article.related_images:
            article.related_images = download_images(
                [image['url'] for image in article.related_images],
                article.seq_id)
        article.attach_feature(
            ArticleParser(article.text).gen_content_feature())
        article.set_usable()
        article.save()
        article.warm(warm_conn)
    except Exception:
        # was a bare except: — narrowed so KeyboardInterrupt/SystemExit
        # propagate; unexpected failures are reported to Sentry instead
        # of crashing the pipeline
        raven_client.captureException()
def _process_article(self, source, rsp):
    """Build and persist a new Article from a downloaded response.

    Validates the response, parses its content, creates an Article,
    downloads images, attaches content features, marks it usable, and
    warms the cache. All failures are reported to Sentry rather than
    propagated.

    :param source: the feed/source the response came from.
    :param rsp: HTTP response object; None means "nothing to do",
        falsy-but-not-None means the download failed.
    """
    try:
        if rsp is None:
            return
        if not rsp:
            info = 'download error: %s' % rsp.request.url
            logging.error(info)
            raise Exception(info)
        if not rsp.content:
            info = 'rsp no content: %s' % rsp.request.url
            logging.error(info)
            raise Exception(info)
        data = parse_data(rsp.content)
        if not data.get('content'):
            return
        data['origin_url'] = rsp.url
        article = Article(source=source, category=source,
                          rss_url=rsp.request.url)
        # BUG FIX: was gen_article_data(source, article, data) — three
        # arguments to a two-parameter function, raising TypeError that
        # the outer except silently swallowed on every article.
        gen_article_data(article, data)
        try:
            article.save()
        except (ArticleExsit, NotUniqueError):
            # already stored by an earlier crawl — nothing more to do
            info = 'duplicated: %s' % article.source_url
            logging.info(info)
            return
        if article.top_images:
            article.top_images = download_images(
                [image['url'] for image in article.top_images],
                article.seq_id)
        if article.related_images:
            article.related_images = download_images(
                [image['url'] for image in article.related_images],
                article.seq_id)
        article.attach_feature(
            ArticleParser(article.text).gen_content_feature())
        article.set_usable()
        article.save()
        article.warm(warm_conn)
    except Exception:
        # was a bare except: — narrowed so KeyboardInterrupt/SystemExit
        # propagate; validation raises above also land here and go to Sentry
        raven_client.captureException()
def gen_article_data(article, data):
    """Copy parsed crawl data onto an Article instance (mutates in place).

    Sets hotness, quality, normalized URL, title, publication timestamps,
    description, keywords, images, text and content. Assumes
    ``article.source_url`` and ``article.category`` are already populated
    by the caller — TODO confirm for newly constructed Articles.

    :param article: Article to populate.
    :param data: dict of parsed fields ('score', 'title', 'pubdate', ...).
    """
    score = data.get('score', 0)
    # top-story articles get a higher base hotness
    article.hot = 500 if 'Top Stories' in article.category else 250
    article.quality = score
    article.source_url = normalize_url(article.source_url)
    article.title = data.get('title')
    pubdate = data.get('pubdate')
    article.pubdate = pubdate
    now = datetime.datetime.utcnow()
    try:
        if not pubdate:
            article.published_at = now
        else:
            published_at = timestr2utc(pubdate)
            published_at = published_at.replace(tzinfo=None)
            if published_at > now:
                # clamp future-dated articles back by 4 hours
                # (presumably to absorb timezone skew — verify)
                delta = datetime.timedelta(hours=4)
                article.published_at = published_at - delta
            else:
                article.published_at = published_at
    except Exception:
        # was a bare except: — narrowed so KeyboardInterrupt/SystemExit
        # propagate; unparsable dates fall back to "now" after reporting
        from crawler.settings import raven_client
        raven_client.captureException()
        article.published_at = now
    article.description = data.get('description')
    keywords = data.get('keywords')
    article.keywords = keywords.split(', ') if keywords else None
    article.related_images = data.get('image_detail')
    image = data.get('image')
    article.top_images = [{'url': image}] if image else None
    article.text = data.get('text')
    article.content = data.get('content')