Пример #1
0
def sync_weixin(name="weixin"):
    """Take one entry off the ``name`` queue and store it as a "gov" weixin.

    When ``queue_len(name)`` is not greater than 1 the call sleeps for ten
    seconds and does nothing else.  Any ``Exception`` raised while processing
    is reported through ``logger.error`` and not propagated.
    """
    try:
        if not (queue_len(name) > 1):
            # Queue too short to process: wait ten seconds and finish.
            time.sleep(10)
            return

        data = deal_rpop(name)

        # Replace the raw publisher value with a publisher record, creating
        # one when the count helper reports none.
        if weixin_publisher_count(data["publisher"]):
            data["publisher"] = get_weixin_publisher(data["publisher"])
        else:
            data["publisher"] = save_weixin_publisher(data)

        data["type"] = "gov"
        weixin = save_weixin(data)  # presumably the Weixin model instance
        relateddata = save_relateddata(data)

        for related in cal_values("weixin", weixin.title, weixin.pubtime):
            # Link this entry's related-data record to the computed result.
            save_relateddata_weixin(relateddata, related)

            # Fetch, or create, the related-data record keyed by the
            # result's uuid and link it back to the new weixin.
            if relateddata_count(related.uuid):
                reverse_record = get_relateddata(related.uuid)
            else:
                reverse_record = save_relateddata({'id': related.uuid})
            save_relateddata_weixin(reverse_record, weixin)

        logger.info("SYNC SUCCEED WEIXIN<%s>" % weixin.uuid)

    except Exception:
        failure = get_exception_info()
        logger.error("SYNC FAILED %s" % failure)
Пример #2
0
    def run(self):
        """Worker loop: take tasks from ``self.taskqueue`` and crawl them.

        Every pass checks ``self.runValue.value`` (a falsy value prints a
        stop message and ends the loop), calls ``self.signalget()``, fetches
        one task, builds a crawler for it, runs the crawl and reports the
        outcome through ``Scheduler.finish``.  A task whose ``empty``
        attribute is truthy makes the loop sleep ten seconds and start over.
        """
        while True:
            if not self.runValue.value:
                print("%s stops" % self.name)
                break

            self.signalget()

            # Time spent waiting for the queue to hand out a task.
            wait_started = time.time()
            self.ctask = self.taskqueue.get()
            if self.ctask.empty:
                time.sleep(10)
                continue
            self.log_wait_task(time.time() - wait_started)
            self.log_get_task()

            crawl_started = time.time()
            c = Crawler().create(self.ctask.type, self.ctask.key, self.ctask.data)
            if not c:
                # No crawler could be created for this task.
                logger.error("CRAWL FAILED - <%s> %s" % (self.taskqueue.queueid, self.ctask))
                success = False
            else:
                try:
                    c.crawl()
                    success = True
                    logger.info("CRAWL SUCCEED - <%s> %s" % (self.taskqueue.queueid, c))
                    self.log_done_task(time.time() - crawl_started)
                except Exception:
                    msg = get_exception_info()
                    success = False
                    logger.error("CRAWL FAILED - <%s> %s, %s" % (self.taskqueue.queueid, c, msg))

            # Report the outcome; an empty dict stands in for the crawler
            # data when no crawler was created.
            crawled = c.data if c else {}
            Scheduler.finish(self.ctask.type, self.ctask.key, crawled, success)
Пример #3
0
def sync_article(name="article"):
    """Take one entry off the ``name`` queue and store it as an article.

    When ``queue_len(name)`` is greater than 1, one entry is popped, its
    publisher is resolved (or created), the article, its category link and
    its related-data record are saved, and every result of ``cal_values`` is
    cross-linked with the article.  Otherwise the call sleeps ten seconds.

    Raises:
        Exception: any error raised while processing is logged through
            ``logger.error`` and then re-raised to the caller.
    """
    try:
        if queue_len(name) > 1:
            data = deal_rpop(name)

            # Replace the raw publisher value with a publisher record,
            # creating one when the count helper reports none.
            if article_publisher_count(data["publisher"]):
                data["publisher"] = get_article_publisher(data["publisher"])
            else:
                data["publisher"] = save_article_publisher(data)

            article = save_article(data)

            category = get_category(data['title'], data['content'])
            save_article_category_articles(data, article, category)

            relateddata = save_relateddata(data)

            result = cal_values("article", article.title, article.pubtime)

            for r in result:
                # Link this article's related-data record to the result ...
                save_relateddata_articles(relateddata, r)
                # ... and the result's related-data record to this article.
                save_relateddata_articles(get_relateddata(r.uuid), article)

            logger.info("SYNC SUCCEED ARTICLE<%s>" % article.uuid)
        else:
            time.sleep(10)

    except Exception:
        # Log first, then re-raise.  The handler used to raise before these
        # lines, which left the logging unreachable.  A bare ``raise`` keeps
        # the original traceback.
        msg = get_exception_info()
        logger.error("SYNC FAILED %s" % msg)
        raise
Пример #4
0
def sync_weibo(name="weibo"):
    """Take one entry off the ``name`` queue and save it as a weibo.

    When ``queue_len(name)`` is not greater than 1 the call sleeps for ten
    seconds instead.  Any ``Exception`` raised while processing is reported
    through ``logger.error`` and not propagated.
    """
    try:
        if not (queue_len(name) > 1):
            # Queue too short to process: wait ten seconds and finish.
            time.sleep(10)
            return

        entry = deal_rpop(name)
        weibo = save_weibo(entry)  # presumably the Weibo model instance
        logger.info("SYNC SUCCEED WEIBO<%s>" % weibo.uuid)

    except Exception:
        failure = get_exception_info()
        logger.error("SYNC FAILED %s" % failure)
Пример #5
0
def sync_topic(name="topic"):
    """Take one entry off the ``name`` queue and store it by its source.

    Entries whose ``source`` is ``u"baidu"`` are saved as topic articles and
    entries whose ``source`` is ``u"sogou"`` as topic weixin records.  In
    both cases the related-data records are cross-linked with every result
    of ``cal_values``.  When ``queue_len(name)`` is not greater than 1 the
    call sleeps ten seconds instead.  Any ``Exception`` raised while
    processing is reported through ``logger.error`` and not propagated.
    """
    try:
        if not (queue_len(name) > 1):
            # Queue too short to process: wait ten seconds and finish.
            time.sleep(10)
            return

        data = deal_rpop(name)

        if data["source"] == u"baidu":
            data["searchmode"] = 1

            # Resolve the publisher record, creating it when none is counted.
            if article_publisher_count(data["publisher"]):
                data["publisher"] = get_article_publisher(data["publisher"])
            else:
                data["publisher"] = save_article_publisher(data)

            article = save_article(data)
            save_topic_article(data, article)
            save_article_category_articles(data, article, u"其他")
            relateddata = save_relateddata(data)

            for related in cal_values("article", article.title, article.pubtime):
                # Link this article's related data to the result ...
                save_relateddata_articles(relateddata, related)
                # ... and the result's related data back to this article.
                save_relateddata_articles(get_relateddata(related.uuid), article)

            logger.info("SYNC SUCCEED TOPICARTICLE<%s>" % article.uuid)

        # Checked independently of the branch above, against the current
        # value of data["source"].
        if data["source"] == u"sogou":
            if weixin_publisher_count(data["publisher"]):
                data["publisher"] = get_weixin_publisher(data["publisher"])
            else:
                data["publisher"] = save_weixin_publisher(data)

            weixin = save_weixin(data)
            save_topic_weixin(data, weixin)
            relateddata = save_relateddata(data)

            for related in cal_values("weixin", weixin.title, weixin.pubtime):
                # Link this weixin's related data to the result ...
                save_relateddata_weixin(relateddata, related)
                # ... and the result's related data back to this weixin.
                save_relateddata_weixin(get_relateddata(related.uuid), weixin)

            logger.info("SYNC SUCCEED TOPICWEIXIN<%s>" % weixin.uuid)

    except Exception:
        failure = get_exception_info()
        logger.error("SYNC FAILED %s" % failure)
Пример #6
0
def sync_weibo(name="weibo"):
    """Take one entry off the ``name`` queue and store it as a "gov" weibo.

    When ``queue_len(name)`` is greater than 1, one entry is popped, its
    publisher is resolved (or created), its ``type`` is set to ``"gov"`` and
    it is saved.  Otherwise the call sleeps ten seconds.  Any ``Exception``
    raised while processing is reported through ``logger.error`` and not
    propagated.
    """
    try:
        if queue_len(name) > 1:
            data = deal_rpop(name)

            # Replace the raw publisher value with a publisher record.
            if weibo_publisher_count(data["publisher"]):
                data["publisher"] = get_weibo_publisher(data["publisher"])
            else:
                # This call used to be spelled ``save_weibo_publisherublisher``,
                # an undefined name, so an entry from a not-yet-stored
                # publisher always failed with NameError.
                data["publisher"] = save_weibo_publisher(data)

            data["type"] = "gov"
            weibo = save_weibo(data)  # presumably the Weibo model instance

            logger.info("SYNC SUCCEED WEIBO<%s>" % weibo.uuid)
        else:
            time.sleep(10)

    except Exception:
        msg = get_exception_info()
        logger.error("SYNC FAILED %s" % msg)
Пример #7
0
    def run(self):
        """Process tasks from ``self.taskqueue`` until the run flag is cleared.

        The loop stops (after printing a message) once
        ``self.runValue.value`` is falsy.  Otherwise it calls
        ``self.signalget()``, fetches a task, and either sleeps ten seconds
        when the task's ``empty`` attribute is truthy or crawls it and hands
        the outcome to ``Scheduler.finish``.
        """
        while True:
            if not self.runValue.value:
                print("%s stops" % self.name)
                break

            self.signalget()

            fetch_t0 = time.time()
            self.ctask = self.taskqueue.get()
            if self.ctask.empty:
                time.sleep(10)
                continue
            # Record how long the fetch took, then that a task was obtained.
            self.log_wait_task(time.time() - fetch_t0)
            self.log_get_task()

            crawl_t0 = time.time()
            c = Crawler().create(
                self.ctask.type,
                self.ctask.key,
                self.ctask.data,
            )

            if c:
                try:
                    c.crawl()
                    success = True
                    ok_args = (self.taskqueue.queueid, c)
                    logger.info("CRAWL SUCCEED - <%s> %s" % ok_args)
                    self.log_done_task(time.time() - crawl_t0)
                except Exception:
                    msg = get_exception_info()
                    success = False
                    err_args = (self.taskqueue.queueid, c, msg)
                    logger.error("CRAWL FAILED - <%s> %s, %s" % err_args)
            else:
                # Crawler creation returned a falsy value for this task.
                err_args = (self.taskqueue.queueid, self.ctask)
                logger.error("CRAWL FAILED - <%s> %s" % err_args)
                success = False

            # An empty dict is reported when there is no crawler to read
            # data from.
            Scheduler.finish(
                self.ctask.type,
                self.ctask.key,
                c.data if c else {},
                success,
            )
Пример #8
0
def sync_article(name="article"):
    """Take one entry off the ``name`` queue and store it as a "gov" article.

    The entry's publisher and area are resolved, then the article, its
    category link and its related-data record are saved.  Unless
    ``filter_data(data)`` is truthy, every result of ``cal_values`` is
    cross-linked with the article.  When ``queue_len(name)`` is not greater
    than 1 the call sleeps ten seconds instead.  Any ``Exception`` raised
    while processing is reported through ``logger.error`` and not propagated.
    """
    try:
        if not (queue_len(name) > 1):
            # Queue too short to process: wait ten seconds and finish.
            time.sleep(10)
            return

        data = deal_rpop(name)

        # Resolve the publisher record, creating it when none is counted.
        if article_publisher_count(data["publisher"]):
            data["publisher"] = get_article_publisher(data["publisher"])
        else:
            data["publisher"] = save_article_publisher(data)
        data["area"] = get_area(data["province"], data["city"], data["district"])
        data["type"] = "gov"

        article = save_article(data)
        category = get_category(data['title'], data['content'])
        save_article_category_articles(data, article, category)
        relateddata = save_relateddata(data)

        # cal_values is skipped for entries that filter_data flags.
        if not filter_data(data):
            for related in cal_values("article", article.title, article.pubtime):
                # Link this article's related data to the result.
                save_relateddata_articles(relateddata, related)

                # Fetch, or create, the related-data record keyed by the
                # result's uuid and link it back to the article.
                if relateddata_count(related.uuid):
                    reverse_record = get_relateddata(related.uuid)
                else:
                    reverse_record = save_relateddata({'id': related.uuid})
                save_relateddata_articles(reverse_record, article)

        logger.info("SYNC SUCCEED ARTICLE<%s>" % article.uuid)

    except Exception:
        failure = get_exception_info()
        logger.error("SYNC FAILED %s" % failure)
Пример #9
0
def sync_topic(name="topic"):
    """Take one entry off the ``name`` queue and store it once per matched area.

    Areas are looked up from the words extracted out of the entry's title;
    when none match, the result of ``get_area()`` is used instead.  For each
    area the entry is saved as an article (source ``u"baidu"``), a weixin
    (``u"sogou"``) or a weibo (``u"weibo"``), and additionally linked either
    to a topic or to a custom keyword depending on ``data["source_type"]``.
    Article and weixin entries are also cross-linked with every result of
    ``cal_values``.

    When ``queue_len(name)`` is not greater than 1 the call sleeps ten
    seconds instead.  Any ``Exception`` raised while processing is reported
    through ``logger.error`` and not propagated.
    """
    try:
        if queue_len(name) > 1:
            data = deal_rpop(name)
            # The title is decoded from UTF-8 before word extraction.
            words = extract_title(data["title"].decode("utf-8"))
            query_areas = get_area_by_words(words)
            # Keep only the lookups whose count() is positive.
            areas = []
            for a in query_areas:
                if a.count() > 0:
                    areas.append(a)

            # No match: fall back to whatever get_area() returns without arguments.
            if len(areas) == 0:
                areas = [get_area()]

            # The entry is stored once per area, using the first element of each lookup.
            # NOTE(review): data["publisher"] is overwritten inside this loop, so from
            # the second area on the publisher helpers receive the value stored by the
            # previous iteration instead of the raw queue value -- confirm this is intended.
            for a in areas:
                data["area"] = a[0]
                if data["source"] == u"baidu":
                    data["searchmode"] = 1
                    # Resolve the publisher record, creating it when none is counted.
                    data["publisher"] = get_article_publisher(data["publisher"]) \
                        if article_publisher_count(data["publisher"]) else save_article_publisher(data)

                    # First source_type value: save as a topic article.
                    if data["source_type"] == "事件":
                        data["type"] = "topic"
                        article = save_article(data)
                        topic_article = save_topic_article(data, article)

                    # Second source_type value: save as a custom (keyword) article.
                    elif data["source_type"] == "关键词":
                        data["type"] = "custom"
                        article = save_article(data)
                        # Reuse the custom record for this key, or create it and
                        # update its keyword.
                        if custom_count(data["key"]):
                            custom = get_custom(data["key"])
                        else :
                            custom = save_custom(data)
                            keyword = update_keyword(custom, data)
                        custom_article = save_custom_article(data, article, custom)
                    
                    # NOTE(review): when source_type matches neither value above,
                    # `article` is unbound on the first iteration (or left over from
                    # the previous one), so the code below fails or reuses stale data.
                    article_category_articles = save_article_category_articles(
                        data, article, u"其他")

                    relateddata = save_relateddata(data)

                    result = cal_values("article", article.title, article.pubtime)

                    for r in result:
                        save_relateddata_articles(relateddata, r) # Save relateddata for article
                        # Fetch, or create, the related-data record keyed by the result's uuid.
                        r_id = get_relateddata(r.uuid) \
                            if relateddata_count(r.uuid) else save_relateddata({'id':r.uuid})
                        save_relateddata_articles(r_id, article) # Save relateddata for r

                    logger.info("SYNC SUCCEED TOPICARTICLE<%s>" % article.uuid)

                elif data["source"] == u"sogou":
                    # Resolve the publisher record, creating it when none is counted.
                    data["publisher"] = get_weixin_publisher(data["publisher"]) \
                        if weixin_publisher_count(data["publisher"]) else save_weixin_publisher(data)

                    # First source_type value: save as a topic weixin.
                    if data["source_type"] == "事件":
                        data["type"] = "topic"
                        weixin = save_weixin(data)                    
                        topic_weixin = save_topic_weixin(data, weixin)

                    # Second source_type value: save as a custom (keyword) weixin.
                    elif data["source_type"] == "关键词":
                        data["type"] = "custom"
                        weixin = save_weixin(data)
                        if custom_count(data["key"]):
                            custom = get_custom(data["key"])
                        else :
                            custom = save_custom(data)
                            keyword = update_keyword(custom, data)                          
                        # NOTE(review): unlike save_custom_article above, `custom` is not
                        # passed here -- verify against save_custom_weixin's signature.
                        custom_weixin = save_custom_weixin(data, weixin)

                    # NOTE(review): `weixin` has the same unbound/stale risk as `article`
                    # above when source_type matches neither value.
                    relateddata = save_relateddata(data)

                    result = cal_values("weixin", weixin.title, weixin.pubtime)

                    for r in result:
                        save_relateddata_weixin(relateddata, r) # Save relateddata for weixin
                        # Fetch, or create, the related-data record keyed by the result's uuid.
                        r_id = get_relateddata(r.uuid) \
                            if relateddata_count(r.uuid) else save_relateddata({'id':r.uuid})
                        save_relateddata_weixin(r_id, weixin) # Save relateddata for r

                    logger.info("SYNC SUCCEED TOPICWEIXIN<%s>" % weixin.uuid)

                elif data["source"] == u"weibo":
                    # Resolve the publisher record, creating it when none is counted.
                    data["publisher"] = get_weibo_publisher(data["publisher"]) \
                        if weibo_publisher_count(data["publisher"]) else save_weibo_publisher(data)

                    # The three counters are set to zero before the weibo is saved.
                    data["attitudes_count"] = 0
                    data["comments_count"] = 0
                    data["reposts_count"] = 0
                    # First source_type value: save as a topic weibo.
                    if data["source_type"] == "事件":
                        data["type"] = "topic"
                        weibo = save_weibo(data)
                        topic_weibo = save_topic_weibo(data, weibo)

                    # Second source_type value: save as a custom (keyword) weibo.
                    elif data["source_type"] == "关键词":
                        data["type"] = "custom"
                        weibo = save_weibo(data)
                        if custom_count(data["key"]):
                            custom = get_custom(data["key"])
                        else :
                            custom = save_custom(data)
                            keyword = update_keyword(custom, data)
                        # NOTE(review): `custom` is not passed here either -- verify
                        # against save_custom_weibo's signature.
                        custom_weibo = save_custom_weibo(data, weibo)

                    # No related-data bookkeeping is done for weibo entries.
                    logger.info("SYNC SUCCEED TOPICWEIBO<%s>" % weibo.uuid)

        else:
            # Queue too short to process: wait ten seconds.
            time.sleep(10)

    except Exception:
        # Failures are logged and not propagated.
        msg = get_exception_info()
        logger.error("SYNC FAILED %s" % msg)