Пример #1
0
def main():
    """Run the crawl loop until ``isKilled`` is truthy or ``get_meta`` fails.

    Each iteration asks ``database.get_meta()`` for one blog record, fetches
    that blog's article list with ``get_article_list``, downloads every
    article with ``get_article`` and hands the non-empty results to
    ``database.save_article``, counting how many were accepted.
    """
    while True:
        # isKilled is not defined in this function; presumably a module-level
        # stop flag set elsewhere -- TODO confirm.
        if isKilled:
            break

        try:
            # last_crawl is unpacked here but not referenced by the steps below.
            b_id, host, realm, last_crawl, last_post, succeed = database.get_meta()

            if not succeed:
                logger.log("Getting blog entity is failed")
                # Leaves main() entirely instead of retrying on the next pass.
                return

            try:
                article_list = get_article_list(host, realm, last_post)
            except Exception, e:
                logger.log("##ERROR:get_list", host, e.message)
                # NOTE(review): flag_rollback presumably undoes the claim taken
                # on this blog by get_meta -- confirm against the database module.
                database.flag_rollback(b_id)
                continue

            logger.log(host, " [", len(article_list), "]")
            success_count = 0
            for article in article_list:

                # A failure on one article is logged and skipped so the rest
                # of the list is still processed.
                try:
                    data = get_article(article, realm)
                except Exception, e:
                    logger.log("##ERROR:get_article", article, e.message)
                    continue

                # Empty results are skipped without being saved or counted.
                if len(data) == 0:
                    continue
                # Only saves that return a truthy value count as accepted.
                if database.save_article(b_id, data):
                    success_count += 1

            logger.log(success_count, "accepted")
            # NOTE(review): the meaning of the 0 flag value is not visible
            # here -- confirm against database.flag.
            database.flag(b_id, 0)
Пример #2
0
                    data = get_article(article, realm)
                except Exception, e:
                    logger.log("##ERROR:get_article", article, e.message)
                    continue

                if len(data) == 0:
                    continue
                if database.save_article(b_id, data):
                    success_count += 1

            logger.log(success_count, "accepted")
            database.flag(b_id, 0)

        except Exception, e:
            logger.log("##ERROR:global_error:", b_id, e.message)
            database.flag_rollback(b_id)



def get_article_list(host, realm=None, lp=None):
    """Request ``host`` over HTTP and collect the blog's article list.

    ``host`` is prefixed with ``http://`` when the string does not already
    contain that text. The page is requested with the ``UserAgent`` header
    and a 5 second timeout; a 404 response yields an empty list. When
    ``realm`` equals ``"Tistory"`` or ``host`` contains ``tistory.com``, the
    list comes from ``mTistory.get_article_list(host, lp)``.

    ``lp`` is forwarded unchanged to the Tistory helper (``main`` passes its
    ``last_post`` value here).
    """
    # NOTE(review): a host starting with "https://" does not contain
    # "http://", so it is prefixed as well ("http://https://...") -- confirm
    # whether https hosts can reach this function.
    if "http://" not in host:
        host = "http://" + host
    # NOTE(review): the local name `re` hides the stdlib `re` module inside
    # this function if the module imports it.
    re = requests.get(host, headers={"User-agent": UserAgent}, timeout=5.0)

    article_list = []
    # A missing page is treated as "no articles" rather than an error.
    if re.status_code == 404:
        return article_list

    if realm == "Tistory" or "tistory.com" in host:
        article_list = mTistory.get_article_list(host, lp)
    #