Example #1
def __init__(self, config):
    '''
    Constructor: read extractor settings from a config dict.
    '''
    self.url = config.get("url", "")
    self.tag = config.get("tag", "default tag")
    self.sub_tag = config.get("sub_tag", None)
    self.mysql_client = MysqlClient()
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
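
For reference, the constructor above only reads three keys from its config argument. A minimal sketch of such a config dict, assuming the enclosing class is the BaseExtractor shown in Example #6 (the values here are invented):

# hypothetical config; only the key names come from the snippet above
config = {
    "url": "http://example.com/news",  # listing page to crawl
    "tag": "news",                     # top-level category
    "sub_tag": "tech",                 # optional sub-category
}
extractor = BaseExtractor(config)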
Example #2
def run(self):
    """
    Re-publish failed urls whose retry count is below the threshold.
    """
    self.mysql_client = MysqlClient()
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
    self.threshold = 5
    self.page = 14  # rows per page for the limit clause
    LOGGER.info("start re-extracting failed urls with count < %s" %
                (self.threshold, ))
    failed_count = 0
    try:
        failed_count = self.mysql_client.getOne(
            "select count(*) as c from failed_url where count < %s",
            (self.threshold, ))
    except Exception:
        LOGGER.error("failed to load the failed url count")
        LOGGER.error(traceback.format_exc())
Example #3
class FailedExtractor(Daemon):
    '''
    Daemon that re-publishes urls from the failed_url table.
    '''
    def __init__(self,
                 pidfile,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull):
        '''
        Constructor
        '''
        super(FailedExtractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        Page through failed urls with count < threshold and re-publish them.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5
        self.page = 14  # rows per page for the limit clause
        LOGGER.info("start re-extracting failed urls with count < %s" %
                    (self.threshold, ))
        failed_count = 0
        try:
            failed_count = self.mysql_client.getOne(
                "select count(*) as c from failed_url where count < %s",
                (self.threshold, ))
        except Exception:
            LOGGER.error("failed to load the failed url count")
            LOGGER.error(traceback.format_exc())
        if not failed_count:  # count query failed; nothing to do
            return
        failed_count = int(failed_count["c"])
        count = 0
        while count < failed_count:
            try:
                LOGGER.debug("processed %s of %s urls" % (count, failed_count))
                urls = self.mysql_client.getAll(
                    "select * from failed_url where count < %s limit %s, %s",
                    (self.threshold, count, self.page))
                if not urls:
                    break
                count += len(urls)
                for url in urls:
                    LOGGER.info("re-extracting url: %s" % (url["url"], ))
                    msg = self.mysql_client.getOne(
                        "select abstract, title from published_url where url = %s",
                        (url["url"], ))
                    if msg is False:  # no published record to copy from
                        continue
                    url["title"] = msg["title"]
                    url["abstract"] = msg["abstract"]
                    self.news_publisher.process(url)
            except Exception:
                LOGGER.error("re-extracting urls failed")
                LOGGER.error(traceback.format_exc())
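
The snippets never show how the daemon is launched. If the Daemon base class follows the usual pidfile-daemon recipe with start()/stop() methods (an assumption, not shown above), the entry point would look roughly like this; the pidfile path is made up:

# hypothetical entry point; Daemon.start()/stop() and the pidfile
# path are assumptions, not shown in the snippets
import sys

if __name__ == '__main__':
    daemon = FailedExtractor("/tmp/failed_extractor.pid")
    if len(sys.argv) == 2 and sys.argv[1] == "stop":
        daemon.stop()
    else:
        daemon.start()  # daemonize, then call run()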
Example #4
def run(self):
    """
    Re-publish urls from published_url whose count is below the threshold.
    """
    self.mysql_client = MysqlClient()
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
    self.threshold = 5
    self.page = 14  # rows per page for the limit clause
    LOGGER.info("start re-extracting published urls with count < %s" %
                (self.threshold, ))
    failed_count = 0
    try:
        failed_count = self.mysql_client.getOne(
            "select count(*) as c from published_url where count < %s",
            (self.threshold, ))
    except Exception:
        LOGGER.error("failed to load the published url count")
        LOGGER.error(traceback.format_exc())
Example #5
class PublishedExtractor(Daemon):
    '''
    Daemon that re-publishes urls from the published_url table.
    '''
    def __init__(self,
                 pidfile,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull):
        '''
        Constructor
        '''
        super(PublishedExtractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        Page through published urls with count < threshold and re-publish them.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5
        self.page = 14  # rows per page for the limit clause
        LOGGER.info("start re-extracting published urls with count < %s" %
                    (self.threshold, ))
        failed_count = 0
        try:
            failed_count = self.mysql_client.getOne(
                "select count(*) as c from published_url where count < %s",
                (self.threshold, ))
        except Exception:
            LOGGER.error("failed to load the published url count")
            LOGGER.error(traceback.format_exc())
        if not failed_count:  # count query failed; nothing to do
            return
        failed_count = int(failed_count["c"])
        count = 0
        while count < failed_count:
            try:
                LOGGER.debug("processed %s of %s urls" % (count, failed_count))
                urls = self.mysql_client.getAll(
                    "select * from published_url where count < %s limit %s, %s",
                    (self.threshold, count, self.page))
                if not urls:
                    break
                count += len(urls)
                for url in urls:
                    LOGGER.info("re-extracting url: %s" % (url["url"], ))
                    # title/abstract already sit on the published_url row,
                    # so no extra lookup is needed here
                    self.news_publisher.process(url)
            except Exception:
                LOGGER.error("re-extracting urls failed")
                LOGGER.error(traceback.format_exc())
Example #6
class BaseExtractor(object):
    '''
    Base class for link extractors.
    '''
    def __init__(self, config):
        '''
        Constructor: read extractor settings from a config dict.
        '''
        self.url = config.get("url", "")
        self.tag = config.get("tag", "default tag")
        self.sub_tag = config.get("sub_tag", None)
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)

    def extract_links(self):
        """
        Extract links from self.url; subclasses are expected to override this.
        """

    def formatMsg(self, url, tag, sub_tag, title, abstract, priority=0):
        msg = {}
        msg["url"] = url
        msg["tag"] = tag
        msg["sub_tag"] = sub_tag
        msg["title"] = title
        msg["abstract"] = abstract
        msg["__priority"] = priority
        msg["version"] = VERSION
        msg["create_time"] = int(time.time() * 1000)  # milliseconds

        return msg

    def isPublished(self, url):
        try:
            url_is_exists = self.mysql_client.getOne(
                "select * from published_url where url=%s", (url, ))
            return url_is_exists is not False
        except Exception:
            # on a database error, report the url as published so it
            # is not queued twice
            return True
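
BaseExtractor leaves extract_links empty for subclasses to fill in. A minimal sketch of a concrete extractor wired to the formatMsg and isPublished helpers above; the parse_links function is invented for illustration:

class ListPageExtractor(BaseExtractor):

    def extract_links(self):
        """
        Extract links from self.url and queue the unpublished ones.
        """
        # parse_links is a hypothetical helper returning
        # (url, title, abstract) tuples for a listing page
        for link, title, abstract in parse_links(self.url):
            if self.isPublished(link):
                continue  # already queued once, skip it
            msg = self.formatMsg(link, self.tag, self.sub_tag, title, abstract)
            self.news_publisher.process(msg)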
Example #7
def __init__(self):
    self.mysql_client = MysqlClient()
    self.mongo_client = MongoClient().tdb.tcoll  # tdb database, tcoll collection
Example #8
class BasicArticleCrawler(object):

    def __init__(self):
        self.mysql_client = MysqlClient()
        self.mongo_client = MongoClient().tdb.tcoll

    def insertSuccess(self, msg):
        """
        The article msg was crawled successfully: drop it from failed_url,
        record it in successed_url and store the article in mongodb.
        """
        try:
            self.mysql_client.begin()

            article = self.mysql_client.getOne(
                "select * from failed_url where url=%s", (msg["url"], ))
            if article is not False:
                self.mysql_client.delete(
                    "delete from failed_url where url=%s", (msg["url"], ))
                LOGGER.info("deleted the article from failed_url: %s", msg["url"])

            article = self.mysql_client.getOne(
                "select * from successed_url where url=%s", (msg["url"], ))
            if article is not False:
                LOGGER.info("article already crawled, skipping save: %s", msg["url"])
                self.mysql_client.end("commit")  # keep the failed_url cleanup
                return

            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s" % (msg["title"], msg["url"]))

            self.mysql_client.insertOne(
                "insert into successed_url(url, tag, sub_tag, version, create_time) "
                "values(%s, %s, %s, %s, %s)",
                (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))

            LOGGER.debug("insert successed_url %s" % (msg["url"], ))
            self.mysql_client.end("commit")

        except Exception:
            self.mysql_client.end("rollback")

            self.mysql_client.begin()
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s" % (msg["url"], ))
            LOGGER.error(traceback.format_exc())
Example #9
def __init__(self):
    self.mysql_client = MysqlClient()
    self.mongo_client = MongoClient().tdb.tcoll
Example #10
class BasicArticleCrawler(object):
    def __init__(self):
        self.mysql_client = MysqlClient()
        self.mongo_client = MongoClient().tdb.tcoll

    def insertSuccess(self, msg):
        """
        The article msg was crawled successfully: drop it from failed_url,
        record it in successed_url and store the article in mongodb.
        """
        try:
            self.mysql_client.begin()

            article = self.mysql_client.getOne(
                "select * from failed_url where url=%s", (msg["url"], ))
            if article is not False:
                self.mysql_client.delete(
                    "delete from failed_url where url=%s", (msg["url"], ))
                LOGGER.info("deleted the article from failed_url: %s",
                            msg["url"])

            article = self.mysql_client.getOne(
                "select * from successed_url where url=%s", (msg["url"], ))
            if article is not False:
                LOGGER.info("article already crawled, skipping save: %s",
                            msg["url"])
                self.mysql_client.end("commit")  # keep the failed_url cleanup
                return

            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s" %
                         (msg["title"], msg["url"]))

            self.mysql_client.insertOne(
                "insert into successed_url(url, tag, sub_tag, version, create_time) "
                "values(%s, %s, %s, %s, %s)",
                (msg["url"], msg["tag"], msg["sub_tag"], VERSION,
                 msg["create_time"]))

            LOGGER.debug("insert successed_url %s" % (msg["url"], ))
            self.mysql_client.end("commit")

        except Exception:
            self.mysql_client.end("rollback")

            self.mysql_client.begin()
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s" %
                         (msg["url"], ))
            LOGGER.error(traceback.format_exc())
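
insertSuccess falls back to an insertFailed method that none of the examples include. A plausible sketch, assuming failed_url carries the same columns plus the count column the extractors query, and assuming the MysqlClient wrapper has an update() call alongside the getOne/insertOne/delete calls used above:

def insertFailed(self, msg):
    """
    Hypothetical sketch: insert the url into failed_url, or bump its
    retry count if it is already there. MysqlClient.update() is an
    assumption, not shown in the snippets.
    """
    try:
        row = self.mysql_client.getOne(
            "select * from failed_url where url=%s", (msg["url"], ))
        if row is False:
            self.mysql_client.insertOne(
                "insert into failed_url(url, tag, sub_tag, version, create_time, count) "
                "values(%s, %s, %s, %s, %s, 1)",
                (msg["url"], msg["tag"], msg["sub_tag"], VERSION,
                 msg["create_time"]))
        else:
            self.mysql_client.update(
                "update failed_url set count = count + 1 where url=%s",
                (msg["url"], ))
        self.mysql_client.end("commit")
    except Exception:
        self.mysql_client.end("rollback")
        LOGGER.error(traceback.format_exc())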
Example #11
@author: lml
'''

import sys
sys.path.append("../")
sys.path.append("../../")
sys.path.append("/home/lml/webcrawler/webcrawler-nlp/crawler/")
from utils.dbmong import MongoClient
from utils.dbmysql import MysqlClient
from time import sleep


if __name__ == '__main__':
    # print url/article counters from mysql and mongo every 10 seconds
    while True:
        print "**************************************************"
        mysql_client = MysqlClient()
        mongo_client = MongoClient()
        published_url_count = mysql_client.getOne(
            "select count(*) as count from published_url")
        print "published url count: %s" % published_url_count["count"]

        successed_url_count = mysql_client.getOne(
            "select count(*) as count from successed_url")
        print "successed url count: %s" % successed_url_count["count"]

        failed_url_count = mysql_client.getOne(
            "select count(*) as count from failed_url")
        print "failed url count: %s" % failed_url_count["count"]

        count = mongo_client.tdb.tcoll.count()
        print "mongo articles: %s" % count
        sleep(10)
        print ""
Example #12
@author: lml
'''

import sys
sys.path.append("../")
sys.path.append("../../")
sys.path.append("/home/lml/webcrawler/webcrawler-nlp/crawler/")
from utils.dbmong import MongoClient
from utils.dbmysql import MysqlClient
from time import sleep

if __name__ == '__main__':
    # print url/article counters from mysql and mongo every 10 seconds
    while True:
        print "**************************************************"
        mysql_client = MysqlClient()
        mongo_client = MongoClient()
        published_url_count = mysql_client.getOne(
            "select count(*) as count from published_url")
        print "published url count: %s" % published_url_count["count"]

        successed_url_count = mysql_client.getOne(
            "select count(*) as count from successed_url")
        print "successed url count: %s" % successed_url_count["count"]

        failed_url_count = mysql_client.getOne(
            "select count(*) as count from failed_url")
        print "failed url count: %s" % failed_url_count["count"]

        count = mongo_client.tdb.tcoll.count()
        print "mongo articles: %s" % count
        sleep(10)  # as in Example #11; without it the loop would spin