示例#1
0
class RefreshRedis():
    """Delete the "finished weixin url" key once the refresh interval elapsed.

    All collaborators (Settings, FileIOMiddleware, RequestsMiddleware,
    Doraemon) are project helpers created in the constructor.
    """

    def __init__(self):
        self.settings = Settings()
        # getSettings() only needs self.settings, so it runs before the
        # remaining helpers are created.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'weixin' settings and shared constants onto the instance."""
        weixin_settings = self.settings.CreateSettings('weixin')
        shared = self.settings
        self.name = weixin_settings['NAME']
        self.log_path = shared.LOG_PATH_PRD2
        self.redis_refresh_path = weixin_settings['REDIS_REFRESH_PATH']
        self.refresh_redis_interval = shared.REFRESH_REDIS_INTERVAL
        self.finished_weixin_url_id = shared.FINISHED_WEIXIN_URL_ID

    def start(self):
        """Delete the key '<FINISHED_WEIXIN_URL_ID>0' if a refresh is due.

        Nothing happens when isExceedRestartInterval() returns False.
        """
        refresh_due = self.doraemon.isExceedRestartInterval(
            self.redis_refresh_path, self.refresh_redis_interval)
        if refresh_due is False:
            return
        self._announce('Start refresh redis')
        self.doraemon.delKey('{0}0'.format(self.finished_weixin_url_id))
        self._announce('Finished to refresh redis')

    def _announce(self, message):
        """Write *message* to the log file and echo it on stdout."""
        self.file.logger(self.log_path, message)
        print(message)
示例#2
0
class SeleniumMiddleware(object):
    """Start a Chrome WebDriver session and load a single URL with it."""

    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()

    def init(self, timeout=None, executable_path=None, proxy=None):
        """Create the Chrome session and the helpers bound to it.

        Args:
            timeout: value passed to set_page_load_timeout() and
                WebDriverWait.
            executable_path: path of the chromedriver binary.
            proxy: optional "host:port"; when given, Chrome is started with
                --proxy-server=http://<proxy>.
        """
        self.file = FileIOMiddleware()
        self.timeout = timeout
        chrome_options = webdriver.ChromeOptions()
        for flag in ('--disable-gpu', '--no-sandbox',
                     '--disable-dev-shm-usage'):
            chrome_options.add_argument(flag)
        if proxy is not None:
            chrome_options.add_argument(
                '--proxy-server=http://{0}'.format(proxy))
        self.browser = webdriver.Chrome(executable_path=executable_path,
                                        chrome_options=chrome_options)
        # The attribute is kept because close() deletes it by name.
        self.load_timeout = self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def close(self):
        """Shut the session down and drop every attribute created by init().

        quit() is attempted even when close() raises, so a failing window
        close no longer leaves the driver running. The original exception
        still propagates to the caller.
        """
        try:
            try:
                self.browser.close()
            finally:
                self.browser.quit()
        finally:
            del self.browser, self.file, self.timeout, self.load_timeout, self.wait
            gc.collect()

    def chrome_request(self, url, log_path, proxy):
        """Open *url* in a fresh Chrome session and return the driver.

        On a page-load timeout the session is closed and the (already
        closed) driver object is still returned, as before. Any other
        error raised while loading also closes the session before being
        re-raised, so the browser is not leaked.
        """
        self.init(timeout=self.settings.SELENIUM_TIMEOUT,
                  executable_path=self.settings.CHROMEDRIVER_PATH,
                  proxy=proxy)
        browser = self.browser
        try:
            self.file.logger(log_path, 'Starting Chrome for: {0}'.format(url))
            browser.get(url)
            return browser
        except TimeoutException:
            self.file.logger(log_path, 'Chrome timeout for: {0}'.format(url))
            self.close()
            return browser
        except Exception:
            # The caller never receives the driver in this case, so it
            # has to be released here.
            self.close()
            raise
示例#3
0
class BrowserRequest():
    """Load a batch of URLs in Chrome through a worker pool."""

    def run_task(self, url_title, url_timeout, callback=callable):
        """Load one URL, hand the live browser to *callback*, record the page.

        Args:
            url_title: sequence whose first item is the URL and second
                item the title passed through to the callback.
            url_timeout: accepted for interface compatibility; not used.
            callback: called with a dict holding 'response' (the
                WebDriver), 'request_url' and 'request_title'.

        Any failure (browser start-up, page load, callback, reading the
        page) is logged and the task ends without recording content. The
        browser is always shut down. Previously the error path fell
        through and used the already-deleted browser, and raised a
        NameError when start-up itself had failed.
        """
        url = url_title[0]
        self.file.logger(self.log_path, 'Start: {0}'.format(url))
        print('Start: {0}'.format(url))
        request = None
        try:
            request = SeleniumMiddleware()
            request.chrome_request(url, self.log_path, self.proxy)
            print('Finish loading: {0}'.format(url))
            response = request.browser
            callback({
                'response': response,
                'request_url': url,
                'request_title': url_title[1]
            })
            final_url = response.current_url
            self.content.append({
                'current_url': final_url,
                'page_source': response.page_source
            })
            message = 'End browser request for: {0}'.format(final_url)
            self.file.logger(self.log_path, message)
            print(message)
        except Exception as e:
            # Format the exception itself: e.message is empty for many
            # exception types.
            message = 'Exception: {0} for {1}'.format(e, url)
            self.file.logger(self.log_path, message)
            print(message)
        finally:
            self._release(request)
            gc.collect()

    def _release(self, request):
        """Close and quit the browser held by *request*, if there is one."""
        browser = getattr(request, 'browser', None)
        if browser is None:
            return
        try:
            try:
                browser.close()
            finally:
                browser.quit()
        except Exception as e:
            self.file.logger(
                self.log_path,
                'Exception: {0} while closing browser'.format(e))

    def start_chrome(self,
                     url_titles,
                     url_timeout,
                     processes,
                     log_path,
                     proxy,
                     callback=callable):
        """Run run_task() for every entry of *url_titles* and return the pages.

        Args:
            url_titles: iterable of [url, title] entries.
            url_timeout: forwarded to run_task().
            processes: pool size.
            log_path: log file used by all tasks.
            proxy: proxy handed to every browser, or None.
            callback: forwarded to run_task().

        Returns:
            The list of {'current_url', 'page_source'} dicts collected by
            the tasks.

        NOTE(review): results are collected through self.content, which
        only works if Pool shares memory with the workers (a thread
        pool) - confirm which Pool this file imports.
        """
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy
        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task,
                                args=(url_title, url_timeout, callback))
        process.close()
        process.join()
        self.file.logger(self.log_path, 'Done')
        print('Done')
        del self.file, process
        gc.collect()
        return self.content
示例#4
0
class Weixin():
    """Experimental crawler: loads WeChat article pages and prints titles."""

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'wx' settings and shared constants onto the instance."""
        settings_name = self.settings.CreateSettings('wx')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = self.settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.valid_proxy_name = self.settings.VALID_PROXY_WX_URL
        self.invalid_proxy_name = self.settings.INVALID_PROXY_WX_URL
        self.finished_wx_id = self.settings.FINISHED_WX_ID
        self.finished_wx_aritcle_list_id = self.settings.FINISHED_WX_ARTICLE_LIST_ID
        self.today = self.settings.TODAY

    def parse(self, response):
        """Print the title found on a loaded page and bump the page counter.

        Args:
            response: dict with 'response' (object exposing current_url
                and page_source), 'request_url' and 'request_title'.

        Pages without a 'weui_media_title' element only print
        "No content". Proxy rotation and the finished-id bookkeeping are
        not implemented here.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        href_item = html.xpath(
            ".//*[contains(@class, 'weui_media_title')]/text()")
        if not href_item:
            print("No content")
            return
        title = ''.join(href_item).strip()
        print(title)
        print(self.count)
        self.count += 1

    def start_requests(self):
        """Request every queued URL once, when the restart interval elapsed.

        The queue currently holds one hard-coded article URL; reading
        the URL pool from the store is not wired in yet.
        """
        if self.doraemon.isExceedRestartInterval(
                self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print('Start {0} requests'.format(self.name))
        self.new_urls = []
        self.current_url = []

        # Both lookups are kept from the original flow although their
        # results are not consumed yet (proxy use is disabled below).
        all_valid_proxy = list(
            self.doraemon.getAllHasSet(self.valid_proxy_name))
        self.proxy = None
        finished_wx_aritcle_list_id = list(
            self.doraemon.getAllHasSet(self.finished_wx_aritcle_list_id))

        self.current_url = [[
            'https://mp.weixin.qq.com/s?timestamp=1555455810&src=3&ver=1&signature=EHLmXR6NesCs9iuBl0SrFK6wHqPspj7zJIWDfOhXY1JPCjnAD8w469-xLwDFXIrJIiN7G4pLm2FcqrBFvCVobdHrvG9AwsUp5Nt-wvpgazEl2MvQPGi020W*K0Lz3gvQSHWzvnW5Li62GqmNGjGohTdyCy911T*ESQXm7O56CIk=',
            'wx'
        ]]

        self.badkeys = ['None']
        self.goodkeys = ['']
        request = BrowserRequest()
        self.count = 0
        while self.current_url:
            batch = list(self.current_url)
            request.start_chrome(batch,
                                 self.max_pool_size,
                                 self.log_path,
                                 self.proxy,
                                 callback=self.parse)
            # parse() never removes anything from the queue, so the
            # processed batch is dropped here; without this the loop
            # requested the same URL forever.
            del self.current_url[:len(batch)]

        self.file.logger(self.log_path,
                         'End for requests of {0}.'.format(self.name))
示例#5
0
class SogoAccount():
    """Resolve WeChat account names to their Sogou result URL via proxies."""

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.requests = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'sogo' settings and shared constants onto the instance."""
        settings_name = self.settings.CreateSettings('sogo')
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = self.settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.valid_proxy_pool_sogo_account = self.settings.VALID_PROXY_POOL_SOGO_ACCOUNT
        self.invalid_proxy_pool_sogo_account = self.settings.INVALID_PROXY_POOL_SOGO_ACCOUNT
        self.finished_sogo_account = self.settings.FINISHED_SOGO_ACCOUNT
        # "a.b.c.d:port" and nothing else. The dots are escaped and the
        # end is anchored because the whole string is later used verbatim
        # as the proxy address.
        self.regx = re.compile(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+$")

    def getProxy(self):
        """Fetch proxies from the provider and store the well-formed ones.

        Every line of the provider response that looks like "ip:port" is
        written to the valid-proxy hash; other lines are reported as
        failures. A missing response is reported and otherwise ignored.
        """
        # NOTE(review): this URL embeds provider credentials; consider
        # moving it into the settings.
        url = "http://ip.16yun.cn:817/myip/pl/c167cc62-6ad5-4876-bfd8-0cc423dab398/?s=wygafjcqjv&u=hellobee&count=2"
        # url = "http://129.28.124.247:43059/get_ip.php?key=908299fbaefcacef4eb2c9e6ea18c5f2"
        response = self.requests.requests_request(url, headers=None, host="ip.16yun.cn", referer="ip.16yun.cn")
        if response is None:
            # The request helper may hand back nothing when the HTTP call
            # failed; there is no text to parse then.
            self.file.logger(self.log_path, 'Fail to get proxy for sogo account.')
            print("Fail to get proxy for sogo account.")
            return
        for proxy in response.text.strip().split('\n'):
            ip = proxy.strip()
            if self.doraemon.isEmpty(ip) is False and self.regx.match(ip) is not None:
                self.file.logger(self.log_path, "Proxy: {0} is available.".format(ip))
                print("Proxy: {0} is available.".format(ip))
                try:
                    self.doraemon.hashSet(self.valid_proxy_pool_sogo_account, ip, ip)
                except Exception as e:
                    message = "Exception to set redis for available sogo account of ip: {0}: {1}.".format(ip, e)
                    print(message)
                    self.file.logger(self.log_path, message)
            else:
                self.file.logger(self.log_path, 'Fail to get proxy for sogo account.')
                print("Fail to get proxy for sogo account.")

    def parse(self, response):
        """Store the account URL found on a result page and advance the queue.

        Args:
            response: dict with 'response' (object exposing current_url
                and page_source), 'request_url' and 'request_title' (the
                account key).

        When the page has no account link the current proxy is treated
        as blocked and replaced; the URL stays queued for a retry.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        key = response['request_title']
        href_item = html.xpath(".//*[contains(@uigs, 'account_name_0')]/@href")
        if not href_item:
            self._rotate_proxy()
            return
        url = urlparse.urljoin(current_url, href_item[0])
        self.doraemon.hashSet(self.name, key, url)
        self.doraemon.hashSet(self.finished_sogo_account, key, key)
        print('Finished for {0}'.format(key))
        self._dequeue([response.get('request_url'), key])
        if self.new_urls:
            new_url = self.new_urls.pop()
            print('Start next: {0}'.format(new_url[0]))
            self.current_url.append(new_url)
        print('End to parse {0}, url: {1}'.format(key, href_item[0]))

    def _dequeue(self, finished):
        """Remove the finished [url, key] entry from the in-flight queue.

        Removing by value keeps an unrelated entry from being dropped
        when several URLs are in flight; the old pop() always took the
        last one.
        """
        try:
            self.current_url.remove(finished)
        except ValueError:
            # Entry not found (unexpected): fall back to the former
            # behaviour so the queue still shrinks.
            if self.current_url:
                self.current_url.pop()

    def _rotate_proxy(self):
        """Mark the current proxy invalid and switch to another one.

        If no proxy is left even after asking the provider again, the
        pending queues are cleared so the crawl stops instead of failing
        on an empty pool.
        """
        print('Blocked and change for another proxy.')
        self.doraemon.hashSet(self.invalid_proxy_pool_sogo_account, self.proxy, self.proxy)
        self.doraemon.delHashSet(self.valid_proxy_pool_sogo_account, self.proxy)
        all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
        if not all_valid_proxy:
            print('The proxy pool is empty and get proxy again.')
            self.file.logger(self.log_path, 'The proxy pool is empty and get proxy again.')
            self.getProxy()
            all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
        if not all_valid_proxy:
            self.file.logger(self.log_path, 'No available proxy for sogo account and return.')
            print("No available proxy for sogo account and return.")
            del self.current_url[:]
            del self.new_urls[:]
            return
        self.proxy = all_valid_proxy.pop()

    def start_requests(self):
        """Look up every unfinished account key listed in the URLS file.

        Does nothing when the restart interval has not elapsed or when
        no proxy is available.
        """
        if self.doraemon.isExceedRestartInterval(self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print('Start {0} requests'.format(self.name))

        try:
            self.getProxy()
        except Exception as e:
            self.file.logger(self.settings.LOG_PATH, 'Exception to get proxy: {0}'.format(e))

        # A set keeps the "already finished?" test cheap for long key lists.
        all_finished_id = set(self.doraemon.getAllHasSet(self.finished_sogo_account))
        all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))

        if self.doraemon.isEmpty(all_valid_proxy):
            self.file.logger(self.log_path, 'No available proxy for sogo account and return.')
            print("No available proxy for sogo account and return.")
            return

        self.new_urls = []
        self.current_url = []

        content = self.file.readFromTxt(self.urls)
        keys = [key for key in content.split('\n')
                if self.doraemon.isEmpty(key) is False]

        self.proxy = all_valid_proxy.pop()
        for key in keys:
            if key not in all_finished_id:
                tmp_url = "https://weixin.sogou.com/weixin?type=1&s_from=input&query={0}&ie=utf8&_sug_=n&_sug_type_=".format(key)
                self.new_urls.append([tmp_url, key])

        request = BrowserRequest()
        if self.new_urls:
            self.current_url.append(self.new_urls.pop())
        else:
            print('No more urls.')
        while self.current_url:
            print("Proxy :{0}".format(self.proxy))
            # One more URL is queued per round, as in the original flow.
            if self.new_urls:
                self.current_url.append(self.new_urls.pop())
            else:
                print('No more urls.')
            request.start_chrome(self.current_url, self.max_pool_size, self.log_path, self.proxy, callback=self.parse)

        self.file.logger(self.log_path, 'End for requests of {0}.'.format(self.name))
示例#6
0
class BrowserRequest():
    """Search Sogou Weixin for account names in Chrome via a worker pool."""

    def run_task(self, url_title, callback=callable):
        """Search one account, open its first hit and pass it to *callback*.

        Args:
            url_title: sequence whose first item is typed into the
                search box and whose second item is passed through to
                the callback as 'request_title'.
            callback: called with a dict holding 'response',
                'request_url' and 'request_title'. 'response' is the
                live WebDriver, or an empty list when the proxy was
                blocked (captcha page).

        Every failure is logged and the browser is always shut down.
        Content is only recorded for pages that were actually opened.
        Previously the blocked path and the error path both went on to
        read attributes of a missing or closed browser.
        """
        query = url_title[0]
        self.file.logger(self.log_path, 'Start: {0}'.format(query))
        print('Start: {0}'.format(query))
        request = None
        try:
            request = SeleniumMiddleware()
            request.chrome_request('https://weixin.sogou.com/', self.log_path,
                                   self.proxy)
            time.sleep(3)
            search_box = request.browser.find_element_by_id("query")
            search_box.send_keys(query)
            request.browser.find_element_by_class_name("swz2").click()
            time.sleep(2)
            blocked = self.is_blocked(request.browser)
            if blocked:
                print('The ip: {0} is blocked.'.format(self.proxy))
                self._release(request)
                response = []
            else:
                account = request.browser.find_element_by_xpath(
                    "//a[@uigs='account_name_0']")
                account.click()
                time.sleep(2)
                # The account page opens in a second window.
                request.browser.switch_to_window(
                    request.browser.window_handles[1])
                response = request.browser
            callback({
                'response': response,
                'request_url': query,
                'request_title': url_title[1]
            })
            if not blocked:
                final_url = response.current_url
                self.content.append({
                    'current_url': final_url,
                    'page_source': response.page_source
                })
                self.file.logger(self.log_path,
                                 'End: {0}'.format(final_url))
                print('End: {0}'.format(final_url))
        except Exception as e:
            # Format the exception itself: e.message is empty for many
            # exception types.
            message = 'Exception: {0} for {1}'.format(e, query)
            self.file.logger(self.log_path, message)
            print(message)
        finally:
            self._release(request)
            gc.collect()

    def _release(self, request):
        """Close and quit the browser held by *request*, at most once."""
        browser = getattr(request, 'browser', None)
        if browser is None:
            return
        try:
            try:
                browser.close()
            finally:
                browser.quit()
        except Exception as e:
            self.file.logger(
                self.log_path,
                'Exception: {0} while closing browser'.format(e))
        finally:
            # Forget the handle so a second call is a no-op.
            try:
                del request.browser
            except AttributeError:
                pass

    def is_blocked(self, browser):
        """Return True when the page contains the 'seccodeImage' element.

        Any error while looking the element up counts as "not blocked".
        """
        try:
            browser.find_element_by_id("seccodeImage")
            return True
        except Exception:
            return False

    def start_chrome(self,
                     url_titles,
                     processes,
                     log_path,
                     proxy,
                     callback=callable):
        """Run run_task() for every entry of *url_titles* and return the pages.

        Args:
            url_titles: iterable of [query, title] entries.
            processes: pool size.
            log_path: log file used by all tasks.
            proxy: proxy handed to every browser, or None.
            callback: forwarded to run_task().

        Returns:
            The list of {'current_url', 'page_source'} dicts collected by
            the tasks.

        NOTE(review): results are collected through self.content, which
        only works if Pool shares memory with the workers (a thread
        pool) - confirm which Pool this file imports.
        """
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy
        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task, args=(url_title, callback))
        process.close()
        process.join()
        self.file.logger(self.log_path, 'Done')
        print('Done')
        del self.file, process
        gc.collect()
        return self.content
示例#7
0
class WeixinTalk():
    """Compress the temporary html/img folders and upload both archives."""

    # Messages per archive kind:
    # (start, finished, failure, nothing-to-upload).
    _UPLOAD_TEXT = {
        'html': ('Start upload html for: {0} ',
                 'Finished upload html for: {0} ',
                 'Exception to upload html: {0}',
                 'No html to upload'),
        'img': ('Start upload image for: {0} ',
                'Finished upload image for: {0} ',
                'Exception to upload img: {0}',
                'No image to upload'),
    }

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.transer = FileTransferMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'weixin' name and the shared upload settings onto self."""
        weixin_settings = self.settings.CreateSettings('weixin')
        shared = self.settings
        self.name = weixin_settings['NAME']
        self.log_path = shared.LOG_PATH
        self.host_name = shared.HOST_NAME
        self.user_name = shared.USER_NAME
        self.password = shared.PASSWORD
        self.port = shared.PORT
        self.remote_img_path = shared.REMOTE_IMG_PATH
        self.remote_html_path = shared.REMOTE_HTML_PATH
        self.max_upload_process = shared.MAX_UPLOAD_PROCESS
        self.temp_html_path = shared.TEMP_FOLDER_HTML
        self.temp_img_path = shared.TEMP_FOLDER_IMG

    def start_upload(self):
        """Tar both temp folders, then upload whichever archive exists.

        A failure while compressing is logged and does not stop the
        upload step; each upload failure is logged on its own.
        """
        try:
            self._report('Start to compress html and img')
            self.doraemon.tar(self.temp_html_path)
            self.doraemon.tar(self.temp_img_path)
            self.file.logger(self.log_path, 'Finish to compress html and img')
            print('Finish to compress html')
        except Exception as e:
            self.file.logger(
                self.log_path,
                'Exception to compress html and img: {0}'.format(e.message))
            print('Exception to compress html and img: {0}'.format(e.message))

        archives = (
            ('html',
             "{0}.tar.gz".format(self.temp_html_path),
             "{0}/html.tar.gz".format(self.remote_html_path)),
            ('img',
             "{0}.tar.gz".format(self.temp_img_path),
             "{0}/img.tar.gz".format(self.remote_img_path)),
        )
        for kind, local_file, remote_file in archives:
            self._upload_archive(kind, local_file, remote_file)

    def _report(self, message):
        """Write *message* to the log file and echo it on stdout."""
        self.file.logger(self.log_path, message)
        print(message)

    def _upload_archive(self, kind, local_file, remote_file):
        """Upload one archive if it exists locally; log the outcome."""
        started, finished, failed, missing = self._UPLOAD_TEXT[kind]
        if not self.doraemon.isFileExists(local_file):
            print(missing)
            return
        try:
            self._report(started.format(self.name))
            self.transer.singleUpload(local_file, remote_file,
                                      self.host_name, self.user_name,
                                      self.password, self.port)
            self._report(finished.format(self.name))
        except Exception as e:
            self.file.logger(self.log_path, failed.format(e.message))
            print(failed.format(e.message))
示例#8
0
class Topbaidu():
    """Crawl list pages and store every new article link in MongoDB."""

    def __init__(self):
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(Settings.LOG_PATH)

    def getSettings(self):
        """Copy the crawler settings onto the instance."""
        # NOTE(review): settings_name is not defined inside this class;
        # presumably it is a module-level mapping - confirm.
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = Settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.today = Settings.TODAY

    def parse(self, response):
        """Extract article links from a loaded list page and store new ones.

        Args:
            response: dict with 'response' (object exposing current_url
                and page_source), 'request_url' and 'request_title'.

        A link is skipped when it has no href, contains no digit, or
        contains one of self.badkeys. self.goodkeys is not consulted: a
        link starts out valid, so only badkeys can reject it (this
        matches the previous behaviour, where the goodkeys loop could
        never change the outcome).
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        href_items = html.xpath(".//*[contains(@class, 'article-item-title')]")
        for item in href_items:
            href = item.xpath("@href")
            if not href:
                continue
            href_url = href[0]
            # Per-character test works for both str and unicode hrefs;
            # filter(str.isdigit, ...) raised TypeError on unicode.
            if not any(ch.isdigit() for ch in href_url):
                print('Invalid url for no id: {0}'.format(href_url))
                continue
            if any(bad in href_url for bad in self.badkeys):
                self.file.logger(self.log_path, 'Invalid {0}'.format(href_url))
                print('Invalid {0}'.format(href_url))
                continue

            # The id is whatever follows the last separator in the href.
            article_id = re.split(r'[., /, _]', href_url)[-1]
            url = urlparse.urljoin(current_url, href_url)
            title = ""
            title_parts = item.xpath(".//text()")
            if title_parts:
                title = title_parts[0]
                print(title)
            is_title_empty = self.doraemon.isEmpty(title)
            if (is_title_empty is False) and (
                    self.doraemon.isDuplicated(title) is False):
                data = {
                    'title': title.strip(),
                    'url': url.strip(),
                    'id': article_id.strip(),
                    'download_time': self.today
                }
                self.file.logger(
                    self.log_path,
                    'Start to store mongo {0}'.format(data['url']))
                print('Start to store mongo {0}'.format(data['url']))
                self.doraemon.storeMongodb(self.mongo, data)
                self.file.logger(
                    self.log_path,
                    'End to store mongo {0}'.format(data['url']))
                print('End to store mongo {0}'.format(data['url']))
                self.file.logger(self.log_path, 'Done for {0}'.format(url))
            else:
                if is_title_empty is True:
                    self.file.logger(self.log_path,
                                     'Empty title for {0}'.format(url))
                    print('Empty title for {0}'.format(url))
                print('Finished or Empty title for {0}'.format(url))
        # Report the page itself: the last href is undefined when the
        # page had no usable link, which used to raise here.
        print('End to parse {0}'.format(current_url))

    def start_requests(self):
        """Load every URL listed in the URLS file and parse the pages.

        Does nothing when the restart interval has not elapsed or the
        file holds no URL.
        """
        if self.doraemon.isExceedRestartInterval(
                self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print('Start {0} requests'.format(self.name))
        self.badkeys = []
        self.goodkeys = []

        content = self.file.readFromTxt(self.urls)
        new_urls = [[url, ''] for url in content.split('\n')
                    if self.doraemon.isEmpty(url) is False]

        if not new_urls:
            print('No url.')
            return

        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        summary = 'End for {0} requests of {1}.'.format(str(len(content)),
                                                        self.name)
        self.file.logger(self.log_path, summary)
        print(summary)
示例#9
0
class RequestsMiddleware():
    """Issue HTTP GET requests with browser-like default headers."""

    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()

    def init(self, headers=None, host=None, referer=None):
        """Prepare the logger, the requests module and the headers to send.

        Args:
            headers: complete header dict to use as-is. When None, a
                default set is built from the settings.
            host: value of the Host header (default set only).
            referer: value of the Referer header (default set only).
        """
        self.file = FileIOMiddleware()
        self.requests = requests
        if headers is not None:
            self.headers = headers
            return
        settings = self.settings
        self.headers = {
            'Accept': settings.ACCEPT,
            # NOTE(review): the setting name is spelled with a zero
            # (ENC0DING); kept as-is to match the settings object.
            'Accept-Encoding': settings.ACCEPT_ENC0DING,
            'Accept-Language': settings.ACCEPT_LANGUAGE,
            'Cache-Control': settings.CACHE_CONTROL,
            'Connection': settings.CONNECTION,
            'Host': host,
            'Upgrade-Insecure-Requests': settings.UPGRADE_INSECURE_REQUESTS,
            'Referer': referer,
            'Pragma': settings.PRAGMA,
            # A random user agent per request.
            'User-Agent': random.choice(settings.USER_AGENTS),
        }

    def requests_request(self, url, headers=None, host=None, referer=None):
        """GET *url* and return the response, or None when the call fails.

        Every exception raised by the request is logged, not propagated.
        """
        self.init(headers=headers, host=host, referer=referer)
        try:
            self.file.logger(self.settings.LOG_PATH, 'Starting Requests')
            return self.requests.get(url=url, headers=self.headers)
        except Exception as e:
            self.file.logger(self.settings.LOG_PATH,
                             'Requests Timeout: {0}'.format(e))
            return None

    def run_task(self,
                 url_title=[],
                 callback=callable,
                 headers=None,
                 host=None):
        """Fetch one URL and hand the response to *callback*.

        Args:
            url_title: sequence whose first item is the URL (also sent
                as Referer) and second item the title passed through to
                the callback.
            callback: called with a dict holding 'response' (may be None
                when the request failed), 'request_url' and
                'request_title'.
            headers: forwarded to requests_request().
            host: forwarded to requests_request().

        A failing callback is logged and ends the task. The method no
        longer tries to delete its own requests_request method (which
        raised AttributeError on every call) and no longer touches the
        response after the error path.
        """
        url = url_title[0]
        self.file.logger(self.log_path, 'Start: {0}'.format(url))
        print('Start: {0}'.format(url))
        response = self.requests_request(url, headers, host, url)
        try:
            callback({
                'response': response,
                'request_url': url,
                'request_title': url_title[1]
            })
        except Exception as e:
            message = 'Exception: {0} for {1}'.format(e, url)
            self.file.logger(self.log_path, message)
            print(message)
            gc.collect()
            return
        # Fall back to the requested URL when there is no response.
        final_url = getattr(response, 'url', url)
        self.file.logger(self.log_path, 'End: {0}'.format(final_url))
        print('End: {0}'.format(final_url))
        gc.collect()

    def start_requests(self,
                       url_titles,
                       processes,
                       log_path,
                       headers,
                       host,
                       proxy,
                       callback=callable):
        """Run run_task() for every entry of *url_titles* in a worker pool.

        Args:
            url_titles: iterable of [url, title] entries.
            processes: pool size.
            log_path: log file used by all tasks.
            headers: forwarded to run_task().
            host: forwarded to run_task().
            proxy: stored on the instance; the request code in this
                class does not use it.
            callback: forwarded to run_task().
        """
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy
        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task,
                                args=(url_title, callback, headers, host))
        process.close()
        process.join()
        self.file.logger(self.log_path, 'Done')
        print('Done')
        del self.file, process
        gc.collect()
示例#10
0
class Huxiu():
    """Collect article metadata and text for the ``huxiu`` source.

    New URLs are read through ``Doraemon.readNewUrls``, rendered by
    ``BrowserRequest.start_chrome`` and handed to ``parse``, which stores
    the extracted fields through ``Doraemon``.
    """

    def __init__(self):
        self.settings = Settings()
        # getSettings() reads self.settings, so it must run right after it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the ``huxiu`` settings and fixed paths onto the instance."""
        settings_name = self.settings.CreateSettings('huxiu')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.name = settings_name['NAME']
        self.is_open_cache = settings_name['IS_OPEN_CACHE']
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        # Hard-coded destinations of the NLP export.
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.max_pool_size = 4

    def _digitsOnly(self, text):
        """Return the characters of ``text`` for which ``isdigit()`` holds."""
        return ''.join(ch for ch in text if ch.isdigit())

    def parse(self, response):
        """Extract one article from a rendered page and persist it.

        Args:
            response: Dict with ``response`` (object exposing
                ``current_url`` and ``page_source``) and ``request_title``.

        The record goes to MongoDB and the text store only when both a
        title and body text were found; the request title is marked as
        finished in either case.
        """
        page = response['response']
        current_url = page.current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(page.page_source)

        title = ""
        comment_number = ""
        share_number = ""
        image_url = ""
        content = ""
        public_time = ""
        author_url = ""
        author_name = ""
        valid = False

        url = current_url
        # current_url is already GBK-encoded; encoding it a second time made
        # Python 2 decode it as ASCII first and fail on non-ASCII URLs.
        article_id = self._digitsOnly(current_url)

        title_nodes = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_nodes = html.xpath(
            ".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_nodes = html.xpath(
            ".//*[contains(@class, 'article-share pull-left')]/text()")
        image_nodes = html.xpath(
            ".//*[contains(@class, 'article-img-box')]/img/@src")
        content_nodes = html.xpath(
            ".//div[contains(@class, 'article-content-wrap')]//text()")
        time_nodes = html.xpath(
            ".//*[contains(@class, 'article-time')]/text()")
        author_url_nodes = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/@href")
        author_name_nodes = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/text()")

        is_empty = self.doraemon.isEmpty
        if is_empty(title_nodes) is False:
            title = title_nodes[0].strip()
        if is_empty(comment_nodes) is False:
            comment_number = self._digitsOnly(comment_nodes[0].encode('gbk'))
        if is_empty(share_nodes) is False:
            share_number = self._digitsOnly(share_nodes[0].encode('gbk'))
        if is_empty(image_nodes) is False:
            image_url = image_nodes[0].strip()
        if is_empty(content_nodes) is False:
            content = ''.join(content_nodes).strip()
            # Body text is what makes a page worth storing.
            valid = True
        if is_empty(time_nodes) is False:
            public_time = self.doraemon.getDateFromString(
                ''.join(time_nodes).strip())
        if is_empty(author_url_nodes) is False:
            author_url = urlparse.urljoin(current_url,
                                          author_url_nodes[0].strip())
        if is_empty(author_name_nodes) is False:
            author_name = author_name_nodes[0].strip()

        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': url,
            'public_time': public_time,
            'author_url': author_url,
            'author_name': author_name,
            'id': article_id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print('End to parse: {0}'.format(current_url))
        if valid and is_empty(title) is False:
            start_store = 'Start to store mongo {0}'.format(data['url'])
            self.file.logger(self.log_path, start_store)
            print(start_store)
            self.doraemon.storeMongodb(self.mongo, data)
            end_store = 'End to store mongo {0}'.format(data['url'])
            self.file.logger(self.log_path, end_store)
            print(end_store)
            self.doraemon.storeTxt(article_id, content,
                                   self.finished_txt_path, self.name)
        # Stored or not, the URL is recorded as handled so that it is not
        # offered again by readNewUrls.
        self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp,
                                    response['request_title'])
        gc.collect()

    def start_requests(self):
        """Render every not-yet-finished URL and parse it with ``parse``."""
        self.file.logger(self.log_path, 'Start request: {0}'.format(self.name))
        print('Start ' + self.name + ' requests')
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp,
                                                   self.url_path)
        # For a manual run, a hard-coded [[url, title]] list can be
        # substituted for new_url_titles here.
        if not new_url_titles:
            nothing_new = 'No new url for: {0}'.format(self.name)
            self.file.logger(self.log_path, nothing_new)
            print(nothing_new)
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        summary = 'End requests: {0}'.format(str(len(content)))
        self.file.logger(self.log_path, summary)
        print(summary)
        del new_url_titles, request, content
        gc.collect()
示例#11
0
class WeixinSalticidae():
    """Download the images of stored weixin article HTML and re-link them.

    For every pending article id the original HTML is read, publish dates
    are rewritten through ``Doraemon.getDateFromString``, each ``<img>`` tag
    is downloaded and pointed at ``URL_DEEPINEWS_10002_IMAGE``, and the
    result is written to the processed-HTML folder.
    """

    def __init__(self):
        self.settings = Settings()
        # getSettings() reads self.settings, so it must run right after it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.finished_img_path)

    def getSettings(self):
        """Copy the ``weixin`` settings onto the instance and build regexes."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_origin_html_path = settings_name[
            'FINISHED_ORIGIN_HTML_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.mongo = settings_name['MONGO']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.url_deepinews_10002_article = self.settings.URL_DEEPINEWS_10002_ARTICLE
        self.url_deepinews_10002_image = self.settings.URL_DEEPINEWS_10002_IMAGE
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        # Captures the attribute text between "<img" and "/>".
        self.regx_img = re.compile('<img(.*?)/>')
        self.regx_date = re.compile(
            '<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>'
        )
        self.regx_img_type = re.compile('data-type="(.*?)"')
        self.regx_img_data_src = re.compile('data-src="(.*?)"')
        # Also matches the tail of data-src="...", so both get rewritten.
        self.regx_img_src = re.compile('src="(.*?)"')
        self.regx_img_class = re.compile('class="(.*?)"')

    def getPostFixOfImage(self, image_type):
        """Map a ``data-type`` value to a file extension.

        Returns ``'jpg'``, ``'png'`` or ``'gif'`` for the types ``jpeg``,
        ``png`` and ``gif``; any other value is printed and yields ``None``.
        """
        post_fix = {'jpeg': 'jpg', 'png': 'png', 'gif': 'gif'}.get(image_type)
        if post_fix is None:
            print('Other type: {0}'.format(image_type))
        return post_fix

    def _rewriteDates(self, html_file, date_list):
        """Return ``html_file`` with each captured publish date converted."""
        template = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'
        for old_time in date_list:
            new_date = self.doraemon.getDateFromString(old_time)
            html_file = html_file.replace(template.format(old_time),
                                          template.format(new_date))
        return html_file

    def _localiseImage(self, html_file, img, image_id):
        """Download one image and re-point its tag.

        Returns the updated HTML, or ``None`` when the tag lacks a
        ``data-src``, a ``src`` or a ``data-type`` and is therefore skipped.
        """
        image_data_src = ''.join(
            re.findall(self.regx_img_data_src, img)).strip()
        src_list = re.findall(self.regx_img_src, img)
        image_type = ''.join(re.findall(self.regx_img_type, img)).strip()
        if (self.doraemon.isEmpty(image_data_src) is True) or \
           (self.doraemon.isEmpty(src_list) is True) or \
           (self.doraemon.isEmpty(image_type) is True):
            return None
        # NOTE(review): for types other than jpeg/png/gif this is None, so
        # the file and URL end in the literal ".None" - confirm whether such
        # images should be skipped instead.
        image_post_fix = self.getPostFixOfImage(image_type)
        origin_image_path = "{0}/{1}.{2}".format(
            self.finished_img_path, image_id, image_post_fix)
        print('Start to download image: {0}'.format(image_data_src))
        self.doraemon.downloadImage(image_data_src, origin_image_path)
        image_size = self.doraemon.getFileSize(origin_image_path)
        if image_size > 60:
            print('Start to compress image: {0}'.format(image_data_src))
            self.doraemon.compressImage(origin_image_path,
                                        origin_image_path, 2)
            print('Finished to compress image: {0}'.format(image_data_src))
        print('Finished to download image: {0}'.format(image_data_src))
        print('Start to replace image url: {0}'.format(image_id))
        new_imgurl = "{0}{1}.{2}".format(
            self.url_deepinews_10002_image, image_id, image_post_fix)
        new_img = img
        for img_class in re.findall(self.regx_img_class, img):
            # An empty attribute value would make str.replace insert the
            # replacement between every character of the tag.
            if img_class:
                new_img = new_img.replace(img_class, 'rich_pages')
        for src in src_list:
            if src:
                new_img = new_img.replace(src, new_imgurl)
        html_file = html_file.replace(img, new_img)
        print('Finished to replace image url: {0}'.format(image_id))
        return html_file

    def start_requests(self):
        """Process every article id that has no finished image pass yet."""
        start_message = 'Start dowload images for: {0} '.format(self.name)
        self.file.logger(self.log_path, start_message)
        print(start_message)
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_image_id, self.finished_content_path)
        if not new_ids:
            nothing_new = 'No new image id for {0}'.format(self.name)
            self.file.logger(self.log_path, nothing_new)
            print(nothing_new)
            return
        self.doraemon.createFilePath(self.finished_processed_html_path)
        self.doraemon.createFilePath(self.finished_img_path)
        for article_id in new_ids:
            print('Start to remove pictures in: {0}'.format(article_id))
            html_file = self.file.readFromHtml("{0}/{1}.html".format(
                self.finished_origin_html_path, article_id))
            # Both lists are taken from the untouched HTML.
            img_list = re.findall(self.regx_img, html_file)
            date_list = re.findall(self.regx_date, html_file)
            html_file = self._rewriteDates(html_file, date_list)
            number = 0  # only images that were actually processed count
            for img in img_list:
                image_id = "{0}_{1}".format(article_id, number)
                updated = self._localiseImage(html_file, img, image_id)
                if updated is None:
                    continue
                html_file = updated
                number += 1
            # Store the working copy. The former separate ``new_html``
            # stayed '' when no date and no image matched, which wrote an
            # empty file for such articles.
            self.doraemon.storeHtml(article_id, html_file,
                                    self.finished_processed_html_path)
            self.doraemon.storeFinished(self.doraemon.bf_finished_image_id,
                                        article_id)
示例#12
0
class TransferToProduction():
    """Copy processed weixin HTML files and their images to temp folders."""

    def __init__(self):
        self.settings = Settings()
        # getSettings() reads self.settings, so it must run right after it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.temp_folder_html)
        self.doraemon.createFilePath(self.temp_folder_img)

    def getSettings(self):
        """Copy the ``weixin`` settings and shared paths onto the instance."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.temp_folder_html = self.settings.TEMP_FOLDER_HTML
        self.temp_folder_img = self.settings.TEMP_FOLDER_IMG
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY

    def start_transfer(self):
        """Copy images and HTML of every not-yet-transferred article id.

        An id is marked as finished only after its HTML file was copied;
        ids whose HTML is missing are logged and left for a later run.
        """
        print('Start {0} transfer'.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_temp_weixin, self.finished_content_path)
        for article_id in new_ids:
            self.file.logger(self.log_path,
                             'Start transfer image: {0}'.format(article_id))
            for f in os.listdir(self.finished_img_path):
                # Literal prefix test. The id used to be compiled as a
                # regular expression, so characters such as "." or "+" in
                # an id matched unrelated files or failed to compile.
                # NOTE(review): a bare prefix still lets id "12" pick up
                # files of id "123" - confirm whether names always start
                # with "<id>_".
                if not f.startswith(article_id):
                    print('Invalid image for not match: {0}'.format(f))
                    continue
                from_img_path = "{0}/{1}".format(self.finished_img_path, f)
                to_img_path = "{0}/{1}".format(self.temp_folder_img, f)
                if os.path.exists(from_img_path) is False:
                    self.file.logger(self.log_path,
                                     'img of {0} not exits.'.format(f))
                    continue
                copyfile(from_img_path, to_img_path)
                print('Finished to transfer image {0}'.format(f))
            self.file.logger(self.log_path,
                             'Start transfer html: {0}'.format(article_id))
            from_path = "{0}/{1}.html".format(
                self.finished_processed_html_path, article_id)
            to_path = "{0}/{1}.html".format(self.temp_folder_html, article_id)
            if os.path.exists(from_path) is False:
                self.file.logger(self.log_path,
                                 'html of {0} not exits.'.format(article_id))
                continue
            copyfile(from_path, to_path)
            print('Finished to transfer html {0}'.format(article_id))
            self.doraemon.storeFinished(self.doraemon.bf_finished_temp_weixin,
                                        article_id)
            print('Finished to transfer {0}'.format(article_id))