Example No. 1
 def __init__(self, siteinfo=None, callback=callable):
     self.siteinfo = siteinfo
     self.callBack = callback
     self.globalSettings = Settings()
     self.file = FileIOMiddleware()
     self.doraemon = Doraemon()
     self.getSettings()
Example No. 2
 def __init__(self):
     settings = Settings()
     settings.CreateCommonSettings()
     self.file = FileIOMiddleware()
     self.rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT)
     self.bf_urls = BloomFilter(self.rconn, settings.BLOOMFILTER_URLS)
     self.bf_content = BloomFilter(self.rconn, settings.BLOOMFILTER_CONTENT)
     self.bf_authors = BloomFilter(self.rconn, settings.BLOOMFILTER_AUTHORS)
     self.disable_restart_interval = settings.DISABLE_RESTART_INTERVAL
     self.bf_weixin_url = BloomFilter(self.rconn,
                                      settings.FINISHED_WEIXIN_URL_ARTICLE)
     self.bf_weixin_content = BloomFilter(
         self.rconn, settings.FINISHED_WEIXIN_CONTENT_ARTICLE)
     self.bf_weixin_id = BloomFilter(self.rconn,
                                     settings.FINISHED_WEIXIN_URL_ID)
     self.bf_finished_image_id = BloomFilter(self.rconn,
                                             settings.FINISHED_IMAGE_ID)
     self.bf_finished_temp_weixin = BloomFilter(
         self.rconn, settings.FINISHED_TEMP_WEIXIN)
     self.md5 = hashlib.md5()
     self.max_concurrency = settings.MAX_CONCURRENCY
     self.concurrency_file = settings.CONCURRENCY_FILE
     self.concurrency_refresh_file = settings.CONCURRENCY_REFRESH_FILE
     self.refresh_concurrency_interval = settings.REFRESH_CONCURRENCY_INTERVAL
     self.max_concurrency_spider = settings.MAX_CONCURRENCY_SPIDER
     self.concurrency_file_spider = settings.CONCURRENCY_FILE_SPIDER
     self.concurrency_refresh_file_spider = settings.CONCURRENCY_REFRESH_FILE_SPIDER
     self.refresh_concurrency_interval_spider = settings.REFRESH_CONCURRENCY_INTERVAL_SPIDER
     self.bf_huxiu_nlp = BloomFilter(self.rconn,
                                     settings.FINISHED_HUXIU_NLP)
     self.sites_info = settings.SITES_INFO
     self.sites_debug = settings.SITES_DEBUG
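
The BloomFilter class constructed above is not shown in these examples; it takes a Redis connection plus a key name and backs the URL, content, and author deduplication. A minimal sketch of a filter with that constructor shape, assuming the standard redis-py setbit/getbit API (the class name, bit size, and seeds here are illustrative, not from the original project):

import hashlib


class SimpleRedisBloomFilter(object):
    # Sketch only: same constructor shape as the BloomFilter used above,
    # i.e. a redis connection plus a key. Sizes and seeds are assumptions.
    def __init__(self, rconn, key, bit_size=1 << 25, seeds=(5, 7, 11, 13, 31)):
        self.rconn = rconn
        self.key = key
        self.bit_size = bit_size
        self.seeds = seeds

    def _positions(self, value):
        # Derive several bit positions from a single md5 digest.
        digest = int(hashlib.md5(value.encode('utf-8')).hexdigest(), 16)
        return [(seed * digest) % self.bit_size for seed in self.seeds]

    def is_contained(self, value):
        # True only if every bit for this value is already set.
        return all(self.rconn.getbit(self.key, pos)
                   for pos in self._positions(value))

    def insert(self, value):
        for pos in self._positions(value):
            self.rconn.setbit(self.key, pos, 1)


# e.g. bf = SimpleRedisBloomFilter(redis.Redis('localhost', 6379), 'bf:urls')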
Example No. 3
 def __init__(self, settingName, callback=callable):
     self.settingName = settingName
     self.callBack = callback
     self.globalSettings = Settings()
     self.getSettings()
     self.file = FileIOMiddleware()
     self.doraemon = Doraemon()
Example No. 4
 def __init__(self):
     self.settings = Settings()
     self.file = FileIOMiddleware()
     self.doraemon = Doraemon()
     self.root = '/home/dev/Data/rsyncData/prd4/sites'
     self.dest = '/home/dev/Data/rsyncData/prd4/local'
     self.resume = '/home/dev/Repository/news/Tegenaria/tSpider/tSpider/dataRecovery/resume.txt'
Example No. 5
    def __init__(self):
        self.doraemon = Doraemon()
        self.file = FileIOMiddleware()
        self.settings = Settings()

        self.cache_file = self.settings.TIMEOUT_CACHE_FILE
        self.timeout = self.settings.PROCESS_TIMEOUT
        self.timeout_content = self.settings.PROCESS_TIMEOUT_CONTENT
Example No. 6
 def __init__(self, fileDirectory=None,
                    fileName=None,
                    bucketName=None,
                    bucketFolderName=None):
     self.settings = Settings()
     self.fileDirectory = fileDirectory
     self.fileName = fileName
     self.bucketName = bucketName
     self.bucketFolderName = bucketFolderName
     # The AccessKey of an Alibaba Cloud primary account has access to every API, which is a high risk. It is strongly recommended to create and use a RAM account for API access and routine operations; log in to https://ram.console.aliyun.com to create one.
     auth = oss2.Auth(self.settings.ALI_OSS_INFO.ip, self.settings.ALI_OSS_INFO.password)
     # The SDK docs use the Hangzhou endpoint as an example; Beijing is used here. Fill in the endpoint for your actual region.
     self.bucket = oss2.Bucket(auth, 'http://oss-cn-beijing.aliyuncs.com', '{0}'.format(self.bucketName))
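
The wrapper above only configures the oss2 client; no upload method is shown. A hedged sketch of how such a wrapper might push the configured file into the bucket, using the documented oss2 Bucket.put_object_from_file call (the function name and key layout below are assumptions):

import os

import oss2


def upload_to_oss(bucket, bucket_folder_name, file_directory, file_name):
    # Object key inside the bucket, e.g. 'imgs/photo.jpg' (layout assumed).
    key = '{0}/{1}'.format(bucket_folder_name, file_name)
    local_path = os.path.join(file_directory, file_name)
    # put_object_from_file is part of the public oss2 Bucket API.
    result = bucket.put_object_from_file(key, local_path)
    # OSS returns HTTP 200 on a successful upload.
    return result.status == 200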
Example No. 7
class NoNameBone():
    def __init__(self, settingName, callback=callable):
        self.settingName = settingName
        self.callBack = callback
        self.globalSettings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.settingName)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.author_path = self.settings.AUTHORS_PATH
        self.name = self.settings.NAME

    def store(self):
        result = self.callBack()
        if result is None:
            return
        print 'Start to store authors for page: {0}'.format(result.page_url)
        if len(result.authors) == 0:
            message1 = 'No author for page: {0}'.format(result.page_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in result.authors:
            is_author_empty = self.doraemon.isEmpty(item)
            if (is_author_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_authors, item) is False):
                message2 = 'Start to store author: {0} for page: {1}.'.format(
                    item, result.page_url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeTxtAdd(self.author_path, item,
                                          self.settingName)
                message3 = 'Succeeded in storing author: {0} for page: {1}.'.format(
                    item, result.page_url)
                self.file.logger(self.log_path, message3)
                print message3
            else:
                if is_author_empty is True:
                    message4 = 'Empty author for {0}'.format(result.page_url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    message5 = 'Duplicated author for {0}'.format(
                        result.page_url)
                    self.file.logger(self.log_path, message5)
                    print message5
        print 'End to store author for page: {0}.'.format(result.page_url)
        del result
        gc.collect()
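
A hedged usage sketch for NoNameBone: the callback is expected to return an object exposing page_url and a list of authors. The stand-in result and setting name below are illustrative, not from the project:

def fetch_authors():
    # Illustrative stand-in for a real page-scraping callback.
    class Result(object):
        page_url = 'http://example.com/page1'
        authors = ['alice', 'bob']
    return Result()


bone = NoNameBone('example_site', callback=fetch_authors)
bone.store()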
Example No. 8
 def __init__(self):
     self.settings = Settings()
     self.getSettings()
     self.file = FileIOMiddleware()
     self.doraemon = Doraemon()
     self.doraemon.createFilePath(self.work_path_prd2)
Example No. 9
 def __init__(self):
     self.settings = Settings()
     self.file = FileIOMiddleware()
     self.doraemon = Doraemon()
Example No. 10
 def __init__(self, siteinfo=None):
     self.siteinfo = siteinfo
     self.globalSettings = Settings()
     self.doraemon = Doraemon()
     self.getSettings()
     self.file = FileIOMiddleware()
Example No. 11
class UpdateMonitorFiles():
    def __init__(self, siteinfo=None):
        self.siteinfo = siteinfo
        self.globalSettings = Settings()
        self.doraemon = Doraemon()
        self.getSettings()
        self.file = FileIOMiddleware()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.work_path_prd4 = self.settings.WORK_PATH_PRD1
        self.work_path_prd3 = self.settings.WORK_PATH_PRD2
        self.content_backup_path = self.settings.FINISHED_BACKUP_PATH
        self.content_backup_post_path = self.settings.FINISHED_BACKUP_POST_PATH
        self.url_backup_path = self.settings.URL_BACKUP_PATH
        self.url_backup_post_path = self.settings.URL_BACKUP_POST_PATH
        self.monitor_site_template_path = self.globalSettings.MONITOR_SITE_TEMPLATE_PATH
        self.monitor_spiders_template_path = self.globalSettings.MONITOR_SPIDERS_TEMPLATE_PATH
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.monitor_site_webserver0 = self.globalSettings.MONITOR_SITE_HTML_WEBSERVER0
        self.monitor_site_url = self.globalSettings.MONITOR_SITE_URL
        self.monitor_upload_webserver0 = self.globalSettings.MONITOR_UPLOAD_PATH_WEBSERVER0

    def updateSpiders(self, siteName, ycount1, tcount1, turl1, diff1, ycount2,
                      tcount2, turl2, diff2):
        return '<tr>' + \
                    '<th align="center" valign="middle">{0}</th>'.format(siteName) + \
                    '<td align="center" valign="middle">{0}</td>'.format(ycount1) + \
                    '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl1, tcount1) + \
                    '<td align="center" valign="middle">{0}</td>'.format(diff1) + \
                    '<td align="center" valign="middle">{0}</td>'.format(ycount2) + \
                    '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl2, tcount2) + \
                    '<td align="center" valign="middle">{0}</td>'.format(diff2) + \
               '</tr>'

    def updateSite(self, number, title, url):
        return '<tr>' + \
                     '<td align="center" valign="middle">{0}</td>'.format(number) + \
                     '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(url, title) + \
               '</tr>'

    def uploadFile(self, fromFile, toFile):
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.globalSettings.IP_WEBSERVER0,
                        self.globalSettings.PORT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0,
                        fromFile, toFile):
                    print 'Succeeded in uploading monitor file: {0}'.format(
                        fromFile)
                    return True
            except Exception as e:
                print 'Exception {0} while uploading monitor site file: {1}'.format(
                    e.message, fromFile)
                return False

    def updateSingleSite(self, preBackupPath, postBackupPath, siteName):
        singleSiteData = singleSiteDto(self.siteinfo.name, 0, 0, None, 0)
        isPreBackupFileExists = os.path.exists(preBackupPath)
        isPostBackupFileExists = os.path.exists(postBackupPath)
        preCsvContent = None
        if isPreBackupFileExists:
            print "Start to read url back up file: {0}".format(
                self.settings.NAME)
            preCsvContent = self.file.readColsFromCSV(preBackupPath,
                                                      ['title', 'url'])
            singleSiteData.tcount = len(preCsvContent.values)
        else:
            print "Url back up file not exits: {0}".format(self.settings.NAME)
            singleSiteData.tcount = 0

        if isPostBackupFileExists:
            print "Start to read post url back up file: {0}".format(
                self.settings.NAME)
            postCsvContent = self.file.readColsFromCSV(postBackupPath,
                                                       ['title', 'url'])
            singleSiteData.ycount = len(postCsvContent.values)
        else:
            print "Post url back up file not exits: {0}".format(
                self.settings.NAME)
            singleSiteData.ycount = 0
        singleSiteData.diff = singleSiteData.tcount - singleSiteData.ycount
        if preCsvContent is not None:
            if preCsvContent.empty:
                print "No new back up url: {0}".format(self.settings.NAME)
            else:
                template = self.file.readFromTxt(
                    self.monitor_site_template_path)
                finalContent = ''
                number = 1
                for item in preCsvContent.values:
                    finalContent = "{0}{1}".format(
                        finalContent, self.updateSite(number, item[1],
                                                      item[0]))
                    number += 1
                template = template.replace(
                    'UpdateTime', self.doraemon.getCurrentLocalTime())
                template = template.replace('ServerName', siteName)
                template = template.replace('SiteName', self.siteinfo.name)
                template = template.replace('MainContent', finalContent)
                turl = '{0}{1}_{2}.html'.format(self.monitor_site_url,
                                                self.settings.NAME, siteName)
                singleSiteData.turl = turl
                uploadLocalHtmlPath = '{0}/{1}_{2}.html'.format(
                    self.monitor_upload_local, self.settings.NAME, siteName)
                self.file.writeToHtmlCover(uploadLocalHtmlPath, template)
        return singleSiteData

    def processAllSites(self, allSitesData=None):
        template = self.file.readFromTxt(self.monitor_spiders_template_path)
        mainContent = ''
        t = totalDto(0, 0, 0, 0, 0, 0)
        for data in allSitesData:
            mainContent = '{0}{1}'.format(
                mainContent,
                self.updateSpiders(data.prd3.sitename, data.prd3.ycount,
                                   data.prd3.tcount, data.prd3.turl,
                                   data.prd3.diff, data.prd4.ycount,
                                   data.prd4.tcount, data.prd4.turl,
                                   data.prd4.diff))
            t.prd3ytotal += data.prd3.ycount
            t.prd3ttotal += data.prd3.tcount
            t.prd4ytotal += data.prd4.ycount
            t.prd4ttotal += data.prd4.tcount
        t.prd3difftotal = t.prd3ttotal - t.prd3ytotal
        t.prd4difftotal = t.prd4ttotal - t.prd4ytotal
        mainContent = '{0}{1}'.format(
            mainContent,
            self.updateSpiders('Summary', t.prd3ytotal, t.prd3ttotal, '',
                               t.prd3difftotal, t.prd4ytotal, t.prd4ttotal, '',
                               t.prd4difftotal))
        template = template.replace('UpdateTime',
                                    self.doraemon.getCurrentLocalTime())
        template = template.replace('MainContent', mainContent)
        localHtmlPath = '{0}/index.html'.format(self.monitor_upload_local)
        self.file.writeToHtmlCover(localHtmlPath, template)
        self.doraemon.tar(self.monitor_upload_local)
        fromFile = '{0}.tar.gz'.format(self.monitor_upload_local)
        self.uploadFile(
            fromFile,
            '{0}/monitor.tar.gz'.format(self.monitor_upload_webserver0))
        os.remove(fromFile)

    def processSingleSite(self):
        spidersContent = allSitesDto(None, None)
        spidersContent.prd3 = self.updateSingleSite(self.url_backup_path,
                                                    self.url_backup_post_path,
                                                    'prd3')
        spidersContent.prd4 = self.updateSingleSite(
            self.content_backup_path, self.content_backup_post_path, 'prd4')
        return spidersContent
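
The monitor pages above are produced by plain string templating: the HTML template files contain the placeholders UpdateTime, ServerName, SiteName, and MainContent, which are replaced in sequence with generated table rows. A minimal sketch of that pattern with an inline template (the project's template files are not shown in these examples, so the template below is illustrative):

template = ('<html><body><p>Updated: UpdateTime</p>'
            '<table>MainContent</table></body></html>')
rows = ''
number = 1
for title, url in [('First post', 'http://example.com/1')]:
    # Same row shape as updateSite: a numbered cell plus a linked title.
    rows += ('<tr><td align="center" valign="middle">{0}</td>'
             '<td align="center" valign="middle">'
             '<a href="{1}" target="_blank">{2}</a></td></tr>'
             ).format(number, url, title)
    number += 1
page = template.replace('UpdateTime', '2020-01-01 00:00:00')
page = page.replace('MainContent', rows)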
Example No. 12
 def __init__(self):
     self.settings = Settings()
     self.file = FileIOMiddleware()
     self.doraemon = Doraemon()
     self.log_path = self.settings.LOG_PATH
     self.doraemon.createFilePath(self.log_path)
Example No. 13
class CamelBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            is_title_empty = self.doraemon.isEmpty(item.title)
            if (is_title_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(
                    self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path,
                                 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Finished title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)

        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        if self.doraemon.isCamelReadyToRun(
                self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6

        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')

        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])

        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.url_timeout,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file,
                                          self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(
            str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7

        del new_urls, content, url_list, request
        gc.collect()
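
CamelBone hands each loaded page to its callback as (current_url, html), where html is an lxml element tree built by etree.HTML, and it expects back items exposing title and url attributes. A hedged sketch of such a callback; the xpath expression and the Link tuple are illustrative, not taken from the project:

import collections

from lxml import etree

Link = collections.namedtuple('Link', ['title', 'url'])


def parse_links(current_url, html):
    # 'html' is the lxml tree CamelBone.parse builds from page_source.
    results = []
    for node in html.xpath('//a[@href]'):
        title = (node.text or '').strip()
        if title:
            results.append(Link(title=title, url=node.get('href')))
    return results


# e.g. parse_links('http://example.com',
#                  etree.HTML('<a href="http://example.com/x">X</a>'))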
Example No. 14
class SpiderBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd1 = self.settings.WORK_PATH_PRD1
        self.finished_txt_path = self.settings.FINISHED_TXT_PATH
        self.finished_html_path = self.settings.FINISHED_HTML_PATH
        self.finished_image_path = self.settings.FINISHED_IMG_PATH
        self.template_path = self.globalSettings.TEMPLATE_PATH
        self.article_url = self.globalSettings.ARTICLE_URL
        self.ali_domain = self.globalSettings.ALI_DOMAIN
        self.ali_domain_deepinews = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS
        self.ali_domain_deepinews_img = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS_IMG
        self.ip_webserver0 = self.globalSettings.IP_WEBSERVER0
        self.port_webserver0 = self.globalSettings.PORT_WEBSERVER0
        self.user_root_webserver0 = self.globalSettings.USER_ROOT_WEBSERVER0
        self.user_root_password_webserver0 = self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0
        self.html_webserver0 = self.globalSettings.HTML_WEBSERVER0
        self.mongo = self.settings.MONGO
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_CONTENT
        self.url_path = self.settings.URL_PATH
        self.is_open_cache = self.settings.IS_OPEN_CACHE
        self.finished_backup_folder_path = self.settings.FINISHED_BACKUP_FOLDER_PATH
        self.max_concurrency_spider = self.globalSettings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = self.globalSettings.CONCURRENCY_FILE_SPIDER
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.local_html_path = self.globalSettings.LOCAL_HTML_PATH
        self.content_timeout = self.settings.CONTENT_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.finished_backup_folder_path)
        self.doraemon.createFilePath(self.monitor_upload_local)
        self.doraemon.createFilePath(self.local_html_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        request_title = response['request_title']
        print 'Start to parse: {0}'.format(current_url)
        page_source = response['response'].page_source
        html = etree.HTML(page_source)
        results = None
        try:
            results = self.callBack(current_url, html, page_source)
            if results is None:
                message1 = 'No content for: {0}'.format(current_url)
                print message1
                self.file.logger(self.log_path, message1)
                return
            dataToMongo = self.doraemon.createSpiderMongoJson(results)
        except Exception as e:
            message1 = 'Exception when parsing: {0} for {1}'.format(current_url, e.message)
            print message1
            self.file.logger(self.log_path, message1)
            # Discard any partial result so dataToMongo is never read while undefined below.
            results = None
        print 'End to parse: {0}'.format(current_url)
        if results is None:
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
            print 'No data for {0}'.format(request_title)
        else:
            message2 = 'Start to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message2)
            print message2
            self.doraemon.storeMongodb(self.mongo, dataToMongo)
            message3 = 'End to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message3)
            print message3
            self.doraemon.storeTxt(results.id, results.content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)

    def start(self):
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        message6 = 'Start requests: {0} '.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
            message7 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message7)
            print message7
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.content_timeout, self.max_pool_size, self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
        message8 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message8)
        print message8
        del content, new_url_titles, request
        gc.collect()
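
Both bones lean on the same dedup flow: read candidate (url, title) pairs, skip any whose title is already in the Bloom filter, and mark titles finished only after storage succeeds. readNewUrls itself is not shown; a hedged reconstruction under the assumption of a tab-separated url/title file (the file format is a guess), reusing the SimpleRedisBloomFilter sketch after Example No. 2:

def read_new_urls(bloom_filter, url_path):
    # Keep only pairs whose title has not been seen before.
    new_url_titles = []
    with open(url_path) as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue
            url, title = parts
            if not bloom_filter.is_contained(title):
                # Same [url, title] pair shape that start_chrome consumes.
                new_url_titles.append([url, title])
    return new_url_titles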