def sync_weixin(name="weixin"):
    """Process at most one queued weixin item from the queue ``name``.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop``; its ``publisher`` entry is replaced by the result of
    ``get_weixin_publisher`` (when ``weixin_publisher_count`` is truthy) or
    ``save_weixin_publisher``; its ``type`` is set to ``"gov"``; and it is
    stored with ``save_weixin``.  For every entry returned by ``cal_values``
    two related-data links are written with ``save_relateddata_weixin``.
    Otherwise the call sleeps for 10 seconds.

    Any ``Exception`` raised inside is reported through ``logger.error`` and
    is not propagated.  Returns ``None``.
    """
    try:
        if queue_len(name) <= 1:
            # Queue too short to process: wait before returning.
            time.sleep(10)
            return

        item = deal_rpop(name)

        # Look the publisher up when it is already known, else create it.
        if weixin_publisher_count(item["publisher"]):
            item["publisher"] = get_weixin_publisher(item["publisher"])
        else:
            item["publisher"] = save_weixin_publisher(item)
        item["type"] = "gov"

        saved = save_weixin(item)
        own_related = save_relateddata(item)

        for match in cal_values("weixin", saved.title, saved.pubtime):
            # Link from this item's related-data record to the match ...
            save_relateddata_weixin(own_related, match)
            # ... and from the match's record (created on demand) back.
            if relateddata_count(match.uuid):
                match_related = get_relateddata(match.uuid)
            else:
                match_related = save_relateddata({'id': match.uuid})
            save_relateddata_weixin(match_related, saved)

        logger.info("SYNC SUCCEED WEIXIN<%s>" % saved.uuid)
    except Exception:
        logger.error("SYNC FAILED %s" % get_exception_info())
def run(self):
    """Fetch and crawl tasks in a loop until ``self.runValue.value`` is falsy.

    Each iteration calls ``self.signalget()``, takes a task from
    ``self.taskqueue`` into ``self.ctask`` and, if the task is flagged
    ``empty``, sleeps 10 seconds and retries.  Otherwise a crawler is built
    with ``Crawler().create`` and run; the outcome is logged and reported
    through ``Scheduler.finish`` together with the crawler's ``data`` (or an
    empty dict when no crawler could be created).

    Exceptions raised while crawling are logged and recorded as a failed
    task; they do not stop the loop.  Returns ``None`` once stopped.
    """
    while True:
        if not self.runValue.value:
            # Single-argument parenthesised form: prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print("%s stops" % self.name)
            break

        self.signalget()

        # Time how long we waited for a task to become available.
        wait_started = time.time()
        self.ctask = self.taskqueue.get()
        if self.ctask.empty:
            time.sleep(10)
            continue
        self.log_wait_task(time.time() - wait_started)
        self.log_get_task()

        crawl_started = time.time()
        crawler = Crawler().create(
            self.ctask.type, self.ctask.key, self.ctask.data)

        if crawler:
            try:
                crawler.crawl()
                success = True
                logger.info("CRAWL SUCCEED - <%s> %s"
                            % (self.taskqueue.queueid, crawler))
                self.log_done_task(time.time() - crawl_started)
            except Exception:
                msg = get_exception_info()
                success = False
                logger.error("CRAWL FAILED - <%s> %s, %s"
                             % (self.taskqueue.queueid, crawler, msg))
        else:
            # No crawler could be created for this task.
            logger.error("CRAWL FAILED - <%s> %s"
                         % (self.taskqueue.queueid, self.ctask))
            success = False

        Scheduler.finish(self.ctask.type, self.ctask.key,
                         crawler.data if crawler else {}, success)
def sync_article(name="article"):
    """Process at most one queued article from the queue ``name``.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop``, its ``publisher`` entry is resolved (looked up when
    ``article_publisher_count`` is truthy, otherwise created), the article
    is saved and attached to the category returned by ``get_category``, and
    related-data links are written for every entry from ``cal_values``.
    Otherwise the call sleeps for 10 seconds.

    Raises:
        Exception: any error raised while processing is logged through
            ``logger.error`` and then re-raised to the caller.
    """
    try:
        if queue_len(name) > 1:
            data = deal_rpop(name)

            if article_publisher_count(data["publisher"]):
                data["publisher"] = get_article_publisher(data["publisher"])
            else:
                data["publisher"] = save_article_publisher(data)

            article = save_article(data)
            category = get_category(data['title'], data['content'])
            save_article_category_articles(data, article, category)

            relateddata = save_relateddata(data)
            for r in cal_values("article", article.title, article.pubtime):
                # Link this article's related-data record to the match ...
                save_relateddata_articles(relateddata, r)
                # ... and the match's record back to this article.
                save_relateddata_articles(get_relateddata(r.uuid), article)

            logger.info("SYNC SUCCEED ARTICLE<%s>" % article.uuid)
        else:
            time.sleep(10)
    except Exception:
        # Log before propagating; a bare ``raise`` keeps the original
        # traceback intact.
        msg = get_exception_info()
        logger.error("SYNC FAILED %s" % msg)
        raise
def sync_weibo(name="weibo"):
    """Process at most one queued weibo item from the queue ``name``.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop`` and stored with ``save_weibo``; otherwise the call sleeps
    for 10 seconds.  Any ``Exception`` is reported through ``logger.error``
    and is not propagated.  Returns ``None``.
    """
    try:
        if queue_len(name) <= 1:
            # Queue too short to process: wait before returning.
            time.sleep(10)
            return

        item = deal_rpop(name)
        saved = save_weibo(item)
        logger.info("SYNC SUCCEED WEIBO<%s>" % saved.uuid)
    except Exception:
        logger.error("SYNC FAILED %s" % get_exception_info())
def sync_topic(name="topic"):
    """Process at most one queued topic item, dispatching on ``source``.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop``.  A ``source`` of ``u"baidu"`` is stored as an article
    (with ``searchmode`` set to 1, a topic link and a fixed category); a
    ``source`` of ``u"sogou"`` is stored as a weixin entry with a topic
    link.  In both cases related-data links are written for every entry
    returned by ``cal_values``.  Otherwise the call sleeps for 10 seconds.

    Any ``Exception`` is reported through ``logger.error`` and is not
    propagated.  Returns ``None``.
    """
    try:
        if queue_len(name) <= 1:
            # Queue too short to process: wait before returning.
            time.sleep(10)
            return

        item = deal_rpop(name)

        if item["source"] == u"baidu":
            item["searchmode"] = 1
            if article_publisher_count(item["publisher"]):
                item["publisher"] = get_article_publisher(item["publisher"])
            else:
                item["publisher"] = save_article_publisher(item)

            article = save_article(item)
            save_topic_article(item, article)
            save_article_category_articles(item, article, u"其他")

            own_related = save_relateddata(item)
            for match in cal_values("article", article.title, article.pubtime):
                save_relateddata_articles(own_related, match)
                save_relateddata_articles(get_relateddata(match.uuid), article)
            logger.info("SYNC SUCCEED TOPICARTICLE<%s>" % article.uuid)

        # Checked independently of the branch above (a plain ``if``).
        if item["source"] == u"sogou":
            if weixin_publisher_count(item["publisher"]):
                item["publisher"] = get_weixin_publisher(item["publisher"])
            else:
                item["publisher"] = save_weixin_publisher(item)

            weixin = save_weixin(item)
            save_topic_weixin(item, weixin)

            own_related = save_relateddata(item)
            for match in cal_values("weixin", weixin.title, weixin.pubtime):
                save_relateddata_weixin(own_related, match)
                save_relateddata_weixin(get_relateddata(match.uuid), weixin)
            logger.info("SYNC SUCCEED TOPICWEIXIN<%s>" % weixin.uuid)
    except Exception:
        logger.error("SYNC FAILED %s" % get_exception_info())
def sync_weibo(name="weibo"):
    """Process at most one queued weibo item from the queue ``name``.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop``; its ``publisher`` entry is replaced by the result of
    ``get_weibo_publisher`` (when ``weibo_publisher_count`` is truthy) or
    ``save_weibo_publisher``; its ``type`` is set to ``"gov"``; and it is
    stored with ``save_weibo``.  Otherwise the call sleeps for 10 seconds.

    Any ``Exception`` is reported through ``logger.error`` and is not
    propagated.  Returns ``None``.
    """
    try:
        if queue_len(name) > 1:
            data = deal_rpop(name)

            # Look the publisher up when it is already known, else create it.
            if weibo_publisher_count(data["publisher"]):
                data["publisher"] = get_weibo_publisher(data["publisher"])
            else:
                data["publisher"] = save_weibo_publisher(data)
            data["type"] = "gov"

            weibo = save_weibo(data)
            logger.info("SYNC SUCCEED WEIBO<%s>" % weibo.uuid)
        else:
            time.sleep(10)
    except Exception:
        msg = get_exception_info()
        logger.error("SYNC FAILED %s" % msg)
def sync_article(name="article"):
    """Process at most one queued article from the queue ``name``.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop``; its ``publisher`` entry is resolved (looked up when
    ``article_publisher_count`` is truthy, otherwise created); its ``area``
    is set from ``get_area(province, city, district)``; its ``type`` is set
    to ``"gov"``; and the article is saved and attached to the category
    returned by ``get_category``.  Related-data links are written only when
    ``filter_data(data)`` is falsy.  Otherwise the call sleeps 10 seconds.

    Any ``Exception`` is reported through ``logger.error`` and is not
    propagated.  Returns ``None``.
    """
    try:
        if queue_len(name) > 1:
            data = deal_rpop(name)

            if article_publisher_count(data["publisher"]):
                data["publisher"] = get_article_publisher(data["publisher"])
            else:
                data["publisher"] = save_article_publisher(data)
            data["area"] = get_area(
                data["province"], data["city"], data["district"])
            data["type"] = "gov"

            article = save_article(data)
            category = get_category(data['title'], data['content'])
            save_article_category_articles(data, article, category)

            relateddata = save_relateddata(data)
            # cal_values is skipped entirely when filter_data(data) is truthy.
            if not filter_data(data):
                for r in cal_values("article", article.title, article.pubtime):
                    # Link this article's related-data record to the match ...
                    save_relateddata_articles(relateddata, r)
                    # ... and the match's record (created on demand) back.
                    if relateddata_count(r.uuid):
                        r_id = get_relateddata(r.uuid)
                    else:
                        r_id = save_relateddata({'id': r.uuid})
                    save_relateddata_articles(r_id, article)

            logger.info("SYNC SUCCEED ARTICLE<%s>" % article.uuid)
        else:
            time.sleep(10)
    except Exception:
        msg = get_exception_info()
        logger.error("SYNC FAILED %s" % msg)
def sync_topic(name="topic"):
    """Process at most one queued topic item, once per matched area.

    When ``queue_len(name)`` is greater than 1, one item is taken with
    ``deal_rpop``.  Areas are looked up from words extracted from the
    UTF-8-decoded title; lookups whose ``count()`` is 0 are dropped, and
    ``get_area()`` is used when none remain.  For each area the item is
    stored according to ``source`` (``u"baidu"`` as article, ``u"sogou"``
    as weixin, ``u"weibo"`` as weibo) and ``source_type`` (topic or custom
    keyword).  Otherwise the call sleeps for 10 seconds.

    Any ``Exception`` is reported through ``logger.error`` and is not
    propagated.  Returns ``None``.

    NOTE(review): the nesting of the per-area loop was reconstructed from
    whitespace-collapsed code -- confirm against the authoritative file.
    """
    try:
        if queue_len(name) <= 1:
            # Queue too short to process: wait before returning.
            time.sleep(10)
            return

        data = deal_rpop(name)
        words = extract_title(data["title"].decode("utf-8"))

        # Keep only area lookups that matched something; fall back to the
        # argument-less get_area() when nothing matched.
        areas = [found for found in get_area_by_words(words)
                 if found.count() > 0]
        if not areas:
            areas = [get_area()]

        for area_set in areas:
            data["area"] = area_set[0]
            source = data["source"]

            if source == u"baidu":
                data["searchmode"] = 1
                if article_publisher_count(data["publisher"]):
                    data["publisher"] = get_article_publisher(data["publisher"])
                else:
                    data["publisher"] = save_article_publisher(data)

                if data["source_type"] == "事件":
                    data["type"] = "topic"
                    article = save_article(data)
                    save_topic_article(data, article)
                elif data["source_type"] == "关键词":
                    data["type"] = "custom"
                    article = save_article(data)
                    if custom_count(data["key"]):
                        custom = get_custom(data["key"])
                    else:
                        custom = save_custom(data)
                    update_keyword(custom, data)
                    save_custom_article(data, article, custom)

                save_article_category_articles(data, article, u"其他")
                own_related = save_relateddata(data)
                for match in cal_values("article", article.title,
                                        article.pubtime):
                    save_relateddata_articles(own_related, match)
                    if relateddata_count(match.uuid):
                        match_related = get_relateddata(match.uuid)
                    else:
                        match_related = save_relateddata({'id': match.uuid})
                    save_relateddata_articles(match_related, article)
                logger.info("SYNC SUCCEED TOPICARTICLE<%s>" % article.uuid)

            elif source == u"sogou":
                if weixin_publisher_count(data["publisher"]):
                    data["publisher"] = get_weixin_publisher(data["publisher"])
                else:
                    data["publisher"] = save_weixin_publisher(data)

                if data["source_type"] == "事件":
                    data["type"] = "topic"
                    weixin = save_weixin(data)
                    save_topic_weixin(data, weixin)
                elif data["source_type"] == "关键词":
                    data["type"] = "custom"
                    weixin = save_weixin(data)
                    if custom_count(data["key"]):
                        custom = get_custom(data["key"])
                    else:
                        custom = save_custom(data)
                    update_keyword(custom, data)
                    save_custom_weixin(data, weixin)

                own_related = save_relateddata(data)
                for match in cal_values("weixin", weixin.title,
                                        weixin.pubtime):
                    save_relateddata_weixin(own_related, match)
                    if relateddata_count(match.uuid):
                        match_related = get_relateddata(match.uuid)
                    else:
                        match_related = save_relateddata({'id': match.uuid})
                    save_relateddata_weixin(match_related, weixin)
                logger.info("SYNC SUCCEED TOPICWEIXIN<%s>" % weixin.uuid)

            elif source == u"weibo":
                if weibo_publisher_count(data["publisher"]):
                    data["publisher"] = get_weibo_publisher(data["publisher"])
                else:
                    data["publisher"] = save_weibo_publisher(data)

                # Counters are always reset to zero before saving.
                data["attitudes_count"] = 0
                data["comments_count"] = 0
                data["reposts_count"] = 0

                if data["source_type"] == "事件":
                    data["type"] = "topic"
                    weibo = save_weibo(data)
                    save_topic_weibo(data, weibo)
                elif data["source_type"] == "关键词":
                    data["type"] = "custom"
                    weibo = save_weibo(data)
                    if custom_count(data["key"]):
                        custom = get_custom(data["key"])
                    else:
                        custom = save_custom(data)
                    update_keyword(custom, data)
                    save_custom_weibo(data, weibo)
                logger.info("SYNC SUCCEED TOPICWEIBO<%s>" % weibo.uuid)
    except Exception:
        logger.error("SYNC FAILED %s" % get_exception_info())