def g_news_item(self, article, start_url="", meta=None):
    # Skip video articles; only plain news items are handled here.
    if article.get("has_video"):
        return None
    docid = article["source_url"]
    crawl_url = self._g_crawl_url(article)
    key = g_cache_key(crawl_url)
    if news_already_exists(key):
        return None
    news = get_default_news(
        crawl_url=crawl_url,
        key=key,
        title=article["title"],
        tags=article.get("keywords", "").split(","),
        summary=article.get("abstract", ""),
        publish_time=str_from_timestamp(article["publish_time"]),
        love=article.get("favorite_count", 0),
        up=article.get("digg_count", 0),
        down=article.get("bury_count", 0),
        original_url=article.get("url", ""),
        original_source=article.get("source", ""),
        crawl_source=CRAWL_SOURCE,
        start_url=start_url,
        start_meta_info=meta,
        comment_url=self._g_comment_url(docid),
        comment_queue=COMMENT_SPIDER_NAME + ":start_urls")
    news["docid"] = news["comment_url"]
    return news
def g_news_item(self, article, start_url="", meta=None):
    news = NewsItem()
    news["docid"] = article["docID"]
    url_163 = article.get("url_163", None)
    if url_163 is None:
        return None
    news["crawl_url"] = self._g_crawl_url(url_163)
    news["key"] = g_cache_key(news["crawl_url"])
    if news_already_exists(news["key"]):
        return None
    news["title"] = article["title"]
    news["tags"] = list()
    news["summary"] = article["summary"]
    # publish_time is a millisecond timestamp; convert to seconds first.
    news["publish_time"] = str_from_timestamp(article["publish_time"] / 1000)
    news["content"] = list()
    position = article.get("position", "null,null,null,null").split(",")
    news["province"] = position[0] if position[0] != "null" else None
    news["city"] = position[1] if position[1] != "null" else None
    news["district"] = position[2] if position[2] != "null" else None
    news["love"] = 0
    news["up"] = 0
    news["down"] = 0
    news["original_url"] = article.get("doc_url", "")
    news["channel"] = article.get("channel", "/").split("/")[0]
    news["category"] = article.get("category", "")
    news["crawl_source"] = CRAWL_SOURCE
    news["original_source"] = article.get("source", "")
    # Skip items sourced from 糗事百科 (Qiushibaike).
    if news["original_source"] == u"糗事百科":
        return None
    news["comment_url"] = self._g_comment_url(docid=news["docid"])
    news["comment_queue"] = COMMENT_SPIDER_NAME + ":start_urls"
    news["start_url"] = start_url
    news["start_meta_info"] = meta
    return news
def g_news_item(self, article, start_url="", meta=None):
    if article["ctype"] not in ["news", "picture"]:
        return None  # fixme: only support news now
    docid = article["docid"]
    crawl_url = self._g_article_url(article.get("url"), docid)
    if not crawl_url:
        return None
    key = g_cache_key(crawl_url)
    if news_already_exists(key):
        return None
    news = get_default_news(crawl_url=crawl_url,
                            key=key,
                            title=article["title"],
                            summary=article.get("summary", ""),
                            publish_time=article["date"],
                            love=article.get("like", 0),
                            up=article.get("up", 0),
                            original_url=article.get("url", ""),
                            crawl_source=CRAWL_SOURCE,
                            original_source=article.get("source", ""),
                            start_url=start_url,
                            start_meta_info=meta,
                            comment_url=self._g_comment_url(docid),
                            comment_queue=COMMENT_SPIDER_NAME + ":start_urls")
    news["docid"] = news["comment_url"]
    return news
def g_news_item(self, article, start_url="", meta=None):
    """Build a news item object.

    :param article: dict containing at least the news ``url`` and ``title`` fields
    :type article: dict
    :param start_url: start url from which the meta info is crawled
    :type start_url: str
    :param meta: extra configuration info
    :type meta: dict
    :return: the news item
    :rtype: News.items.NewsItem | None
    """
    crawl_url = article["url"]
    key = g_cache_key(crawl_url)
    if news_already_exists(key):
        return None
    news = get_default_news(title=article["title"],
                            crawl_url=crawl_url,
                            docid=crawl_url,
                            key=key,
                            crawl_source=self.crawl_source,
                            start_url=start_url,
                            summary=article.get("summary", ""),
                            start_meta_info=meta)
    return news
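# Usage sketch (illustrative, not part of the original spider): a hypothetical
# parse callback that feeds listing-page entries into g_news_item above.
# The JSON payload shape (a top-level "data" list) and the response.meta key
# are assumptions for the example only.
def parse(self, response):
    import json  # local import for the sketch; a real spider would import at module level
    for article in json.loads(response.text).get("data", []):
        item = self.g_news_item(article,
                                start_url=response.url,
                                meta=response.meta.get("start_meta_info"))
        if item is not None:
            yield item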
def g_news_item(self, article, start_url="", meta=None):
    docid = article['url']
    crawl_url = self._g_article_url(article['url'])
    if not crawl_url:
        return None
    key = g_cache_key(crawl_url)
    if news_already_exists(key):
        return None
    news = get_default_news(crawl_url=crawl_url,
                            key=key,
                            title=article['title'],
                            docid=docid,
                            start_meta_info=meta,
                            crawl_source=CRAWL_SOURCE)
    return news
def g_news_item(self, article, start_url="", meta=None):
    crawl_url = article["crawl_url"]
    comment_url = self._g_comment_url(crawl_url)
    # The dedup key is derived from the title here rather than the crawl url.
    news = get_default_news(
        crawl_url=crawl_url,
        docid=comment_url,
        key=g_cache_key(article["title"].encode("utf-8")),
        crawl_source=CRAWL_SOURCE,
        start_url=start_url,
        summary=article["summary"],
        publish_time=article["publish_time"],
        title=article["title"],
        start_meta_info=meta,
        comment_url=comment_url,
        comment_queue=COMMENT_SPIDER_NAME + ":start_urls")
    return None if news_already_exists(news["key"]) else news
def g_news_item(self, article, start_url="", meta=None):
    news = NewsItem()
    news["docid"] = article["docid"]
    news["crawl_url"] = self._g_crawl_url(news['docid'])
    news["key"] = g_cache_key(news["crawl_url"])
    if news_already_exists(news["key"]):
        return None
    news["title"] = article["title"]
    # Use the long title as the summary when the article provides one.
    if 'ltitle' in article and article['ltitle']:
        news['summary'] = article['ltitle']
    news["tags"] = list()
    news["publish_time"] = article["ptime"]
    news["content"] = list()
    news["love"] = 0
    news["up"] = 0
    news["down"] = 0
    news["crawl_source"] = CRAWL_SOURCE
    news['original_url'] = ''
    news["original_source"] = article.get('source', '')
    # news["comment_url"] = self._g_comment_url(docid=news["docid"])
    news["start_url"] = start_url
    news["start_meta_info"] = meta
    return news