class SeleniumMiddleware(object):
    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()

    def init(self, timeout=None, executable_path=None, proxy=None):
        self.file = FileIOMiddleware()
        self.timeout = timeout
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        if proxy is not None:
            chrome_options.add_argument('--proxy-server=http://{0}'.format(proxy))
        self.browser = webdriver.Chrome(executable_path=executable_path,
                                        chrome_options=chrome_options)
        self.load_timeout = self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def close(self):
        self.browser.close()
        self.browser.quit()
        del self.browser, self.file, self.timeout, self.load_timeout, self.wait
        gc.collect()

    def chrome_request(self, url, log_path, proxy):
        self.init(timeout=self.settings.SELENIUM_TIMEOUT,
                  executable_path=self.settings.CHROMEDRIVER_PATH,
                  proxy=proxy)
        try:
            self.file.logger(log_path, 'Starting Chrome for: {0}'.format(url))
            self.browser.get(url)
            return self.browser
        except TimeoutException:
            browser = self.browser
            self.file.logger(log_path, 'Chrome timeout for: {0}'.format(url))
            self.close()
            return browser
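# A minimal usage sketch for SeleniumMiddleware (assumption: this demo is not
# part of the original project). It relies on Settings.CreateCommonSettings()
# providing SELENIUM_TIMEOUT, CHROMEDRIVER_PATH and LOG_PATH; the target URL
# below is a hypothetical placeholder.
if __name__ == '__main__':
    selenium_demo = SeleniumMiddleware()
    # chrome_request returns the webdriver instance; on success the caller
    # should call close() to release the Chrome process.
    browser = selenium_demo.chrome_request('https://example.com',
                                           selenium_demo.settings.LOG_PATH,
                                           None)  # proxy=None -> direct connection
    print browser.current_url
    selenium_demo.close()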
class RefreshRedis():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('weixin')
        self.name = settings_name['NAME']
        self.log_path = self.settings.LOG_PATH_PRD2
        self.redis_refresh_path = settings_name['REDIS_REFRESH_PATH']
        self.refresh_redis_interval = self.settings.REFRESH_REDIS_INTERVAL
        self.finished_weixin_url_id = self.settings.FINISHED_WEIXIN_URL_ID

    def start(self):
        if self.doraemon.isExceedRestartInterval(self.redis_refresh_path,
                                                 self.refresh_redis_interval) is False:
            return
        self.file.logger(self.log_path, 'Start refresh redis')
        print 'Start refresh redis'
        key = '{0}0'.format(self.finished_weixin_url_id)
        self.doraemon.delKey(key)
        self.file.logger(self.log_path, 'Finished to refresh redis')
        print 'Finished to refresh redis'
class BrowserRequest():
    def run_task(self, url_title, url_timeout, callback=callable):
        self.file.logger(self.log_path, 'Start: {0}'.format(url_title[0]))
        print 'Start: {0}'.format(url_title[0])
        try:
            request = SeleniumMiddleware()
            request.chrome_request(url_title[0], self.log_path, self.proxy)
            print 'Finish loading: {0}'.format(url_title[0])
            response = request.browser
            callback({
                'response': response,
                'request_url': url_title[0],
                'request_title': url_title[1]
            })
        except Exception as e:
            self.file.logger(self.log_path, 'Exception: {0} for {1}'.format(e.message, url_title[0]))
            print 'Exception: {0} for {1}'.format(e.message, url_title[0])
            response.close()
            response.quit()
            del response, request
            gc.collect()
            # The browser has been torn down, so skip result collection for a failed request.
            return
        self.content.append({
            'current_url': response.current_url,
            'page_source': response.page_source
        })
        self.file.logger(self.log_path, 'End browser request for: {0}'.format(response.current_url))
        print 'End browser request for: {0}'.format(response.current_url)
        response.close()
        response.quit()
        del response, request
        gc.collect()

    def start_chrome(self, url_titles, url_timeout, processes, log_path, proxy, callback=callable):
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy
        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task, args=(url_title, url_timeout, callback))
        process.close()
        process.join()
        self.file.logger(self.log_path, 'Done')
        print 'Done'
        del self.file, process
        gc.collect()
        return self.content
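# A minimal usage sketch for BrowserRequest (assumption: this demo is not part
# of the original project). The URL/title pair, pool size and log path are
# hypothetical placeholders; the callback just reports what it received.
def demo_parse(result):
    # result carries the live webdriver plus the url/title that produced it
    print 'Parsed {0} ({1})'.format(result['request_url'], result['request_title'])

if __name__ == '__main__':
    demo_request = BrowserRequest()
    demo_content = demo_request.start_chrome(
        [['https://example.com', 'example']],  # list of [url, title] pairs
        30,                                    # url_timeout, passed through to run_task
        1,                                     # number of worker processes in the Pool
        '/tmp/browser_request.log',            # hypothetical log path
        None,                                  # proxy
        callback=demo_parse)
    print 'Collected {0} pages'.format(len(demo_content))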
class BrowserRequest():
    def run_task(self, url_title, callback=callable):
        self.file.logger(self.log_path, 'Start: {0}'.format(url_title[0]))
        print 'Start: {0}'.format(url_title[0])
        request = SeleniumMiddleware()
        request.chrome_request('https://weixin.sogou.com/', self.log_path, self.proxy)
        time.sleep(3)
        input = request.browser.find_element_by_id("query")
        input.send_keys(url_title[0])
        button = request.browser.find_element_by_class_name("swz2")
        button.click()
        time.sleep(2)
        if self.is_blocked(request.browser):
            print 'The ip: {0} is blocked.'.format(self.proxy)
            request.browser.close()
            request.browser.quit()
            del request.browser
            gc.collect()
            response = []
        else:
            account = request.browser.find_element_by_xpath("//a[@uigs='account_name_0']")
            account.click()
            time.sleep(2)
            request.browser.switch_to_window(request.browser.window_handles[1])
            response = request.browser
        try:
            callback({
                'response': response,
                'request_url': url_title[0],
                'request_title': url_title[1]
            })
        except Exception as e:
            self.file.logger(self.log_path, 'Exception: {0} for {1}'.format(e.message, url_title[0]))
            print 'Exception: {0} for {1}'.format(e.message, url_title[0])
            request.browser.close()
            request.browser.quit()
            del request
            gc.collect()
            # The browser is gone at this point, so skip result collection.
            return
        self.content.append({
            'current_url': response.current_url,
            'page_source': response.page_source
        })
        self.file.logger(self.log_path, 'End: {0}'.format(response.current_url))
        print 'End: {0}'.format(response.current_url)
        response.close()
        response.quit()
        del response, request
        gc.collect()

    def is_blocked(self, browser):
        try:
            browser.find_element_by_id("seccodeImage")
            return True
        except:
            return False

    def start_chrome(self, url_titles, processes, log_path, proxy, callback=callable):
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy
        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task, args=(url_title, callback))
        process.close()
        process.join()
        self.file.logger(self.log_path, 'Done')
        print 'Done'
        del self.file, process
        gc.collect()
        return self.content
class WeixinTalk():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.transer = FileTransferMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('weixin')
        self.name = settings_name['NAME']
        self.log_path = self.settings.LOG_PATH
        self.host_name = self.settings.HOST_NAME
        self.user_name = self.settings.USER_NAME
        self.password = self.settings.PASSWORD
        self.port = self.settings.PORT
        self.remote_img_path = self.settings.REMOTE_IMG_PATH
        self.remote_html_path = self.settings.REMOTE_HTML_PATH
        self.max_upload_process = self.settings.MAX_UPLOAD_PROCESS
        self.temp_html_path = self.settings.TEMP_FOLDER_HTML
        self.temp_img_path = self.settings.TEMP_FOLDER_IMG

    def start_upload(self):
        try:
            self.file.logger(self.log_path, 'Start to compress html and img')
            print 'Start to compress html and img'
            self.doraemon.tar(self.temp_html_path)
            self.doraemon.tar(self.temp_img_path)
            self.file.logger(self.log_path, 'Finish to compress html and img')
            print 'Finish to compress html and img'
        except Exception as e:
            self.file.logger(self.log_path, 'Exception to compress html and img: {0}'.format(e.message))
            print 'Exception to compress html and img: {0}'.format(e.message)
        local_html_tmp_file = "{0}.tar.gz".format(self.temp_html_path)
        remote_html_tmp_file = "{0}/html.tar.gz".format(self.remote_html_path)
        local_img_tmp_file = "{0}.tar.gz".format(self.temp_img_path)
        remote_img_tmp_file = "{0}/img.tar.gz".format(self.remote_img_path)
        if self.doraemon.isFileExists(local_html_tmp_file):
            try:
                self.file.logger(self.log_path, 'Start upload html for: {0} '.format(self.name))
                print 'Start upload html for: {0} '.format(self.name)
                self.transer.singleUpload(local_html_tmp_file, remote_html_tmp_file,
                                          self.host_name, self.user_name,
                                          self.password, self.port)
                self.file.logger(self.log_path, 'Finished upload html for: {0} '.format(self.name))
                print 'Finished upload html for: {0} '.format(self.name)
            except Exception as e:
                self.file.logger(self.log_path, 'Exception to upload html: {0}'.format(e.message))
                print 'Exception to upload html: {0}'.format(e.message)
        else:
            print 'No html to upload'
        if self.doraemon.isFileExists(local_img_tmp_file):
            try:
                self.file.logger(self.log_path, 'Start upload image for: {0} '.format(self.name))
                print 'Start upload image for: {0} '.format(self.name)
                self.transer.singleUpload(local_img_tmp_file, remote_img_tmp_file,
                                          self.host_name, self.user_name,
                                          self.password, self.port)
                self.file.logger(self.log_path, 'Finished upload image for: {0} '.format(self.name))
                print 'Finished upload image for: {0} '.format(self.name)
            except Exception as e:
                self.file.logger(self.log_path, 'Exception to upload img: {0}'.format(e.message))
                print 'Exception to upload img: {0}'.format(e.message)
        else:
            print 'No image to upload'
class Topbaidu():
    def __init__(self):
        # Assumption: the original snippet never constructs Settings, yet
        # getSettings() needs it; created here so the class can run.
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(Settings.LOG_PATH)

    def getSettings(self):
        # Assumption: the original snippet referenced settings_name without
        # defining it; 'topbaidu' mirrors the CreateSettings(...) keys used by
        # the sibling spiders and may differ from the real settings name.
        settings_name = self.settings.CreateSettings('topbaidu')
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = Settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.today = Settings.TODAY

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        href_items = html.xpath(".//*[contains(@class, 'article-item-title')]")
        for item in href_items:
            href = item.xpath("@href")
            valid = True
            if len(href) == 0:
                continue
            href_url = href[0]
            hasId = str(filter(str.isdigit, href_url))
            if len(hasId) == 0:
                print 'Invalid url for no id: {0}'.format(href_url)
                continue
            for good in self.goodkeys:
                if valid == True:
                    continue
                if good in href_url:
                    valid = True
            for bad in self.badkeys:
                if valid == False:
                    continue
                if bad in href_url:
                    valid = False
            if valid:
                short_url_parts = re.split(r'[., /, _]', href_url)
                id = short_url_parts[len(short_url_parts) - 1]
                url = urlparse.urljoin(current_url, href_url)
                title = ""
                title_list1 = item.xpath(".//text()")
                if len(title_list1) > 0:
                    title = title_list1[0]
                print title
                is_title_empty = self.doraemon.isEmpty(title)
                if (is_title_empty is False) and (self.doraemon.isDuplicated(title) is False):
                    data = {
                        'title': title.strip(),
                        'url': url.strip(),
                        'id': id.strip(),
                        'download_time': self.today
                    }
                    self.file.logger(self.log_path, 'Start to store mongo {0}'.format(data['url']))
                    print 'Start to store mongo {0}'.format(data['url'])
                    self.doraemon.storeMongodb(self.mongo, data)
                    self.file.logger(self.log_path, 'End to store mongo {0}'.format(data['url']))
                    print 'End to store mongo {0}'.format(data['url'])
                    self.file.logger(self.log_path, 'Done for {0}'.format(url))
                else:
                    if is_title_empty is True:
                        self.file.logger(self.log_path, 'Empty title for {0}'.format(url))
                        print 'Empty title for {0}'.format(url)
                    print 'Finished or Empty title for {0}'.format(url)
            else:
                self.file.logger(self.log_path, 'Invalid {0}'.format(href_url))
                print 'Invalid {0}'.format(href_url)
            print 'End to parse {0}'.format(href_url)

    def start_requests(self):
        if self.doraemon.isExceedRestartInterval(self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print 'Start {0} requests'.format(self.name)
        self.badkeys = []
        self.goodkeys = []
        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')
        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])
        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls, self.max_pool_size, self.log_path, None, callback=self.parse)
        self.file.logger(self.log_path, 'End for {0} requests of {1}.'.format(str(len(content)), self.name))
        print 'End for {0} requests of {1}.'.format(str(len(content)), self.name)
class Weixin():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('wx')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = self.settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.valid_proxy_name = self.settings.VALID_PROXY_WX_URL
        self.invalid_proxy_name = self.settings.INVALID_PROXY_WX_URL
        self.finished_wx_id = self.settings.FINISHED_WX_ID
        self.finished_wx_aritcle_list_id = self.settings.FINISHED_WX_ARTICLE_LIST_ID
        self.today = self.settings.TODAY

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title']
        href_item = html.xpath(".//*[contains(@class, 'weui_media_title')]/text()")
        if len(href_item) == 0:
            print "No content"
            return
        # self.doraemon.hashSet(self.invalid_proxy_name, self.proxy, self.proxy)
        # self.doraemon.delHashSet(self.valid_proxy_name, self.proxy)
        # self.doraemon.hashSet(self.finished_wx_aritcle_list_id, id, id)
        title = ''.join(href_item).strip()
        print title
        print self.count
        self.count += 1
        # self.current_url.pop()
        # print 'Finished for {0} -- id: {1}'.format(id, self.proxy)
        # self.file.logger(self.log_path, 'Finished for {0} -- id: {1}'.format(id, self.proxy))
        # if len(self.new_urls) > 0:
        #     new_url = self.new_urls.pop()
        #     print 'Start next: {0}'.format(new_url[0])
        #     self.current_url.append(new_url)
        # print 'End to parse {0}, url: {1}'.format(id, href_item[0])

    def start_requests(self):
        if self.doraemon.isExceedRestartInterval(self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print 'Start {0} requests'.format(self.name)
        self.new_urls = []
        self.current_url = []
        all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_name))
        # self.proxy = all_valid_proxy.pop()
        self.proxy = None
        finished_wx_aritcle_list_id = list(self.doraemon.getAllHasSet(self.finished_wx_aritcle_list_id))
        # self.urls_article_list = self.doraemon.getAllHasSet(self.url_pool)
        # for key in self.urls_article_list:
        #     url = self.urls_article_list[key]
        #     if key not in finished_wx_aritcle_list_id:
        #         self.new_urls.append([url, key])
        # if len(self.new_urls) > 0:
        #     self.current_url.append(self.new_urls.pop())
        # else:
        #     print 'No more urls.'
        #     return
        self.current_url = [[
            'https://mp.weixin.qq.com/s?timestamp=1555455810&src=3&ver=1&signature=EHLmXR6NesCs9iuBl0SrFK6wHqPspj7zJIWDfOhXY1JPCjnAD8w469-xLwDFXIrJIiN7G4pLm2FcqrBFvCVobdHrvG9AwsUp5Nt-wvpgazEl2MvQPGi020W*K0Lz3gvQSHWzvnW5Li62GqmNGjGohTdyCy911T*ESQXm7O56CIk=',
            'wx'
        ]]
        self.badkeys = ['None']
        self.goodkeys = ['']
        request = BrowserRequest()
        self.count = 0
        while len(self.current_url) > 0:
            request.start_chrome(self.current_url, self.max_pool_size, self.log_path, self.proxy, callback=self.parse)
        self.file.logger(self.log_path, 'End for requests of {0}.'.format(self.name))
class Huxiu():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('huxiu')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.name = settings_name['NAME']
        self.max_pool_size = 4
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.is_open_cache = settings_name['IS_OPEN_CACHE']

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        data = {}
        comment_number = ""
        title = ""
        url = ""
        id = ""
        share_number = ""
        image_url = ""
        content = ""
        time = ""
        author_url = ""
        author_name = ""
        valid = False
        url = current_url
        id = str(filter(str.isdigit, current_url.encode('gbk')))
        title1 = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_number1 = html.xpath(".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_number1 = html.xpath(".//*[contains(@class, 'article-share pull-left')]/text()")
        image_url1 = html.xpath(".//*[contains(@class, 'article-img-box')]/img/@src")
        content1 = html.xpath(".//div[contains(@class, 'article-content-wrap')]//text()")
        time1 = html.xpath(".//*[contains(@class, 'article-time')]/text()")
        author_url1 = html.xpath(".//*[contains(@class, 'author-name')]/a/@href")
        author_name1 = html.xpath(".//*[contains(@class, 'author-name')]/a/text()")
        if self.doraemon.isEmpty(title1) is False:
            title = title1[0].strip()
        if self.doraemon.isEmpty(comment_number1) is False:
            comment_number = str(filter(str.isdigit, comment_number1[0].encode('gbk'))).strip()
        if self.doraemon.isEmpty(share_number1) is False:
            share_number = str(filter(str.isdigit, share_number1[0].encode('gbk'))).strip()
        if self.doraemon.isEmpty(image_url1) is False:
            image_url = image_url1[0].strip()
        if self.doraemon.isEmpty(content1) is False:
            content = ''.join(content1).strip()
            valid = True
        if self.doraemon.isEmpty(time1) is False:
            time = ''.join(time1).strip()
            time = self.doraemon.getDateFromString(time)
        if self.doraemon.isEmpty(author_url1) is False:
            author_url = urlparse.urljoin(current_url, author_url1[0].strip())
        if self.doraemon.isEmpty(author_name1) is False:
            author_name = ''.join(author_name1[0]).strip()
        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': url,
            'public_time': time,
            'author_url': author_url,
            'author_name': author_name,
            'id': id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print 'End to parse: {0}'.format(current_url)
        if valid == True and self.doraemon.isEmpty(title) is False:
            self.file.logger(self.log_path, 'Start to store mongo {0}'.format(data['url']))
            print 'Start to store mongo {0}'.format(data['url'])
            self.doraemon.storeMongodb(self.mongo, data)
            self.file.logger(self.log_path, 'End to store mongo {0}'.format(data['url']))
            print 'End to store mongo {0}'.format(data['url'])
            self.doraemon.storeTxt(id, content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp, response['request_title'])
        else:
            self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp, response['request_title'])
        del current_url, html, title, comment_number, share_number, image_url, url, content, time, author_url, author_name, id, data
        gc.collect()

    def start_requests(self):
        self.file.logger(self.log_path, 'Start request: {0}'.format(self.name))
        print 'Start ' + self.name + ' requests'
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp, self.url_path)
        # new_url_titles = [['https://www.huxiu.com/article/36.html', '【WHAT】十年内10大互联网IPO']]
        if len(new_url_titles) == 0:
            self.file.logger(self.log_path, 'No new url for: {0}'.format(self.name))
            print 'No new url for: {0}'.format(self.name)
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.max_pool_size, self.log_path, None, callback=self.parse)
        self.file.logger(self.log_path, 'End requests: {0}'.format(str(len(content))))
        print 'End requests: {0}'.format(str(len(content)))
        del new_url_titles, request, content
        gc.collect()
class WeixinSalticidae():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.finished_img_path)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_origin_html_path = settings_name['FINISHED_ORIGIN_HTML_PATH']
        self.finished_processed_html_path = settings_name['FINISHED_PROCESSED_HTML_PATH']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.mongo = settings_name['MONGO']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.url_deepinews_10002_article = self.settings.URL_DEEPINEWS_10002_ARTICLE
        self.url_deepinews_10002_image = self.settings.URL_DEEPINEWS_10002_IMAGE
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.regx_img = re.compile('<img(.*?)/>')
        self.regx_date = re.compile('<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>')
        self.regx_img_type = re.compile('data-type="(.*?)"')
        self.regx_img_data_src = re.compile('data-src="(.*?)"')
        self.regx_img_src = re.compile('src="(.*?)"')
        self.regx_img_class = re.compile('class="(.*?)"')

    def getPostFixOfImage(self, image_type):
        if image_type == 'jpeg':
            return 'jpg'
        if image_type == 'png':
            return 'png'
        if image_type == 'gif':
            return 'gif'
        else:
            print 'Other type: {0}'.format(image_type)

    def start_requests(self):
        self.file.logger(self.log_path, 'Start download images for: {0} '.format(self.name))
        print 'Start download images for: {0} '.format(self.name)
        new_ids = self.doraemon.readNewImageIds(self.doraemon.bf_finished_image_id, self.finished_content_path)
        if len(new_ids) == 0:
            self.file.logger(self.log_path, 'No new image id for {0}'.format(self.name))
            print 'No new image id for {0}'.format(self.name)
            return
        self.doraemon.createFilePath(self.finished_processed_html_path)
        self.doraemon.createFilePath(self.finished_img_path)
        for id in new_ids:
            print 'Start to remove pictures in: {0}'.format(id)
            html_file = self.file.readFromHtml("{0}/{1}.html".format(self.finished_origin_html_path, id))
            img_list = re.findall(self.regx_img, html_file)
            date_list = re.findall(self.regx_date, html_file)
            new_html = ''
            number = 0
            for old_time in date_list:
                new_date = self.doraemon.getDateFromString(old_time)
                old_time_content = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'.format(old_time)
                new_time_content = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'.format(new_date)
                new_html = html_file.replace(old_time_content, new_time_content)
                html_file = new_html
            for img in img_list:
                old_img = img
                image_id = "{0}_{1}".format(id, number)
                image_data_src = ''.join(re.findall(self.regx_img_data_src, img)).strip()
                image_src = re.findall(self.regx_img_src, img)
                image_type = ''.join(re.findall(self.regx_img_type, img)).strip()
                image_post_fix = self.getPostFixOfImage(image_type)
                if (self.doraemon.isEmpty(image_data_src) is True) or \
                   (self.doraemon.isEmpty(image_src) is True) or \
                   (self.doraemon.isEmpty(image_type) is True):
                    continue
                origin_image_path = "{0}/{1}.{2}".format(self.finished_img_path, image_id, image_post_fix)
                print 'Start to download image: {0}'.format(image_data_src)
                self.doraemon.downloadImage(image_data_src, origin_image_path)
                image_size = self.doraemon.getFileSize(origin_image_path)
                if image_size > 60:
                    print 'Start to compress image: {0}'.format(image_data_src)
                    self.doraemon.compressImage(origin_image_path, origin_image_path, 2)
                    print 'Finished to compress image: {0}'.format(image_data_src)
                print 'Finished to download image: {0}'.format(image_data_src)
                print 'Start to replace image url: {0}'.format(image_id)
                new_imgurl = "{0}{1}.{2}".format(self.url_deepinews_10002_image, image_id, image_post_fix)
                # new_imgurl = '/home/dev/Data/rsyncData/prd4/weixin/img/{0}.{1}'.format(image_id, image_post_fix)
                src_list = re.findall(self.regx_img_src, img)
                img_class_list = re.findall(self.regx_img_class, img)
                for img_class in img_class_list:
                    new_img = img.replace(img_class, 'rich_pages')
                    img = new_img
                for src in src_list:
                    new_img = img.replace(src, new_imgurl)
                    img = new_img
                new_html = html_file.replace(old_img, img)
                html_file = new_html
                print 'Finished to replace image url: {0}'.format(image_id)
                number += 1
            self.doraemon.storeHtml(id, new_html, self.finished_processed_html_path)
            self.doraemon.storeFinished(self.doraemon.bf_finished_image_id, id)
class SogoAccount():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.requests = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('sogo')
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = self.settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.valid_proxy_pool_sogo_account = self.settings.VALID_PROXY_POOL_SOGO_ACCOUNT
        self.invalid_proxy_pool_sogo_account = self.settings.INVALID_PROXY_POOL_SOGO_ACCOUNT
        self.finished_sogo_account = self.settings.FINISHED_SOGO_ACCOUNT
        self.regx = re.compile("[0-9]{1,}.[0-9]{1,}.[0-9]{1,}.[0-9]{1,}:[0-9]{1,}")

    def getProxy(self):
        url = "http://ip.16yun.cn:817/myip/pl/c167cc62-6ad5-4876-bfd8-0cc423dab398/?s=wygafjcqjv&u=hellobee&count=2"
        # url = "http://129.28.124.247:43059/get_ip.php?key=908299fbaefcacef4eb2c9e6ea18c5f2"
        response = self.requests.requests_request(url, headers=None, host="ip.16yun.cn", referer="ip.16yun.cn")
        proxy_list = response.text.strip().split('\n')
        for proxy in proxy_list:
            ip = proxy.strip()
            isValidIp = self.regx.match(ip)
            if self.doraemon.isEmpty(ip) is False and isValidIp is not None:
                self.file.logger(self.log_path, "Proxy: {0} is available.".format(ip))
                print "Proxy: {0} is available.".format(ip)
                try:
                    self.doraemon.hashSet(self.valid_proxy_pool_sogo_account, ip, ip)
                except Exception as e:
                    print "Exception to set redis for available sogo account of ip: {0}: {1}.".format(ip, e.message)
                    self.file.logger(self.log_path, "Exception to set redis for available sogo account of ip: {0}: {1}.".format(ip, e.message))
            else:
                self.file.logger(self.log_path, 'Fail to get proxy for sogo account.')
                print "Fail to get proxy for sogo account."

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title']
        href_item = html.xpath(".//*[contains(@uigs, 'account_name_0')]/@href")
        if len(href_item) == 0:
            print 'Blocked and change for another proxy.'
            self.doraemon.hashSet(self.invalid_proxy_pool_sogo_account, self.proxy, self.proxy)
            self.doraemon.delHashSet(self.valid_proxy_pool_sogo_account, self.proxy)
            all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
            if len(all_valid_proxy) == 0:
                print 'The proxy pool is empty and get proxy again.'
                self.file.logger(self.log_path, 'The proxy pool is empty and get proxy again.')
                self.getProxy()
                all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
            self.proxy = all_valid_proxy.pop()
            return
        href = href_item[0]
        url = urlparse.urljoin(current_url, href)
        self.doraemon.hashSet(self.name, key, url)
        self.doraemon.hashSet(self.finished_sogo_account, key, key)
        print 'Finished for {0}'.format(key)
        self.current_url.pop()
        if len(self.new_urls) > 0:
            new_url = self.new_urls.pop()
            print 'Start next: {0}'.format(new_url[0])
            self.current_url.append(new_url)
        print 'End to parse {0}, url: {1}'.format(key, href_item[0])

    def start_requests(self):
        if self.doraemon.isExceedRestartInterval(self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print 'Start {0} requests'.format(self.name)
        try:
            self.getProxy()
        except Exception as e:
            self.file.logger(self.settings.LOG_PATH, 'Exception to get proxy: {0}'.format(str(e.message)))
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_sogo_account))
        all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
        if self.doraemon.isEmpty(all_valid_proxy):
            self.file.logger(self.log_path, 'No available proxy for sogo account and return.')
            print "No available proxy for sogo account and return."
            return
        self.new_urls = []
        self.current_url = []
        keys = []
        content = self.file.readFromTxt(self.urls)
        keys_list = content.split('\n')
        for key in keys_list:
            if self.doraemon.isEmpty(key) is False:
                keys.append(key)
        self.proxy = all_valid_proxy.pop()
        for key in keys:
            if key not in all_finished_id:
                timestamp = '00'.join(str(time.time()).split('.'))
                tmp_url = "https://weixin.sogou.com/weixin?type=1&s_from=input&query={0}&ie=utf8&_sug_=n&_sug_type_=".format(key)
                self.new_urls.append([tmp_url, key])
        request = BrowserRequest()
        if len(self.new_urls) > 0:
            self.current_url.append(self.new_urls.pop())
        else:
            print 'No more urls.'
        while len(self.current_url) > 0:
            print "Proxy :{0}".format(self.proxy)
            if len(self.new_urls) > 0:
                self.current_url.append(self.new_urls.pop())
            else:
                print 'No more urls.'
            request.start_chrome(self.current_url, self.max_pool_size, self.log_path, self.proxy, callback=self.parse)
        self.file.logger(self.log_path, 'End for requests of {0}.'.format(self.name))
class RequestsMiddleware():
    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()

    def init(self, headers=None, host=None, referer=None):
        self.file = FileIOMiddleware()
        self.requests = requests
        self.headers = headers
        if headers is None:
            self.headers = {}
        self.headers['Accept'] = self.settings.ACCEPT
        self.headers['Accept-Encoding'] = self.settings.ACCEPT_ENC0DING
        self.headers['Accept-Language'] = self.settings.ACCEPT_LANGUAGE
        self.headers['Cache-Control'] = self.settings.CACHE_CONTROL
        self.headers['Connection'] = self.settings.CONNECTION
        self.headers['Host'] = host
        self.headers['Upgrade-Insecure-Requests'] = self.settings.UPGRADE_INSECURE_REQUESTS
        self.headers['Referer'] = referer
        self.headers['Pragma'] = self.settings.PRAGMA
        self.headers['User-Agent'] = self.settings.USER_AGENTS[random.randint(0, len(self.settings.USER_AGENTS) - 1)]

    def requests_request(self, url, headers=None, host=None, referer=None):
        self.init(headers=headers, host=host, referer=referer)
        try:
            self.file.logger(self.settings.LOG_PATH, 'Starting Requests')
            res = self.requests.get(url=url, headers=self.headers)
            return res
        except Exception as e:
            self.file.logger(self.settings.LOG_PATH, 'Requests Timeout: {0}'.format(str(e.message)))

    def run_task(self, url_title=[], callback=callable, headers=None, host=None):
        self.file.logger(self.log_path, 'Start: {0}'.format(url_title[0]))
        print 'Start: {0}'.format(url_title[0])
        response = self.requests_request(url_title[0], headers, host, url_title[0])
        try:
            callback({
                'response': response,
                'request_url': url_title[0],
                'request_title': url_title[1]
            })
        except Exception as e:
            self.file.logger(self.log_path, 'Exception: {0} for {1}'.format(e.message, url_title[0]))
            print 'Exception: {0} for {1}'.format(e.message, url_title[0])
            del response
            gc.collect()
            # Nothing left to report once the callback has failed.
            return
        self.file.logger(self.log_path, 'End: {0}'.format(response.url))
        print 'End: {0}'.format(response.url)
        del response
        gc.collect()

    def start_requests(self, url_titles, processes, log_path, headers, host, proxy, callback=callable):
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy
        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task, args=(url_title, callback, headers, host))
        process.close()
        process.join()
        self.file.logger(self.log_path, 'Done')
        print 'Done'
        del self.file, process
        gc.collect()
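# A minimal usage sketch for RequestsMiddleware (assumption: this demo is not
# part of the original project). The URL, Host and Referer values are
# hypothetical; the default headers come from Settings.CreateCommonSettings().
if __name__ == '__main__':
    http = RequestsMiddleware()
    res = http.requests_request('https://example.com',
                                headers=None,             # let init() build the default header set
                                host='example.com',
                                referer='https://example.com')
    if res is not None:                                   # requests_request returns None on failure
        print res.status_code
        print len(res.text)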
class TransferToProduction():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.temp_folder_html)
        self.doraemon.createFilePath(self.temp_folder_img)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_processed_html_path = settings_name['FINISHED_PROCESSED_HTML_PATH']
        self.temp_folder_html = self.settings.TEMP_FOLDER_HTML
        self.temp_folder_img = self.settings.TEMP_FOLDER_IMG
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY

    def start_transfer(self):
        print 'Start {0} transfer'.format(self.name)
        new_ids = self.doraemon.readNewImageIds(self.doraemon.bf_finished_temp_weixin, self.finished_content_path)
        for id in new_ids:
            self.file.logger(self.log_path, 'Start transfer image: {0}'.format(id))
            regx_img_file = re.compile(id)
            for f in os.listdir(self.finished_img_path):
                isValidImage = regx_img_file.match(f)
                if isValidImage is None:
                    print 'Invalid image for not match: {0}'.format(f)
                    continue
                from_img_path = "{0}/{1}".format(self.finished_img_path, f)
                to_img_path = "{0}/{1}".format(self.temp_folder_img, f)
                is_from_path_exists = os.path.exists(from_img_path)
                if is_from_path_exists is False:
                    self.file.logger(self.log_path, 'img of {0} does not exist.'.format(f))
                    continue
                copyfile(from_img_path, to_img_path)
                print 'Finished to transfer image {0}'.format(f)
            self.file.logger(self.log_path, 'Start transfer html: {0}'.format(id))
            from_path = "{0}/{1}.html".format(self.finished_processed_html_path, id)
            to_path = "{0}/{1}.html".format(self.temp_folder_html, id)
            is_from_path_exists = os.path.exists(from_path)
            if is_from_path_exists is False:
                self.file.logger(self.log_path, 'html of {0} does not exist.'.format(id))
                continue
            copyfile(from_path, to_path)
            print 'Finished to transfer html {0}'.format(id)
            self.doraemon.storeFinished(self.doraemon.bf_finished_temp_weixin, id)
            print 'Finished to transfer {0}'.format(id)
class Chronus():
    def init(self):
        self.getHourMinute()
        self.getDate()
        self.file = FileIOMiddleware()
        self.table_path = Settings.CHRONUS_SETTINGS
        self.base_path = Settings.RSYNC_PRD1

    def createEmailBody(self):
        self.isReadyToSend = False
        if len(self.static) < 2:
            return
        self.body = '<div>--------Chronus {0} {1} -------- </div>'.format(self.YearMonthDay, self.hourMinute)
        self.body = '{0}<div>Total {1} Increase {2} for {3} app</div>'.format(self.body, str(self.total), str(self.increase_sum), str(len(self.preData) - 1))
        self.body = '{0}<div>{1} -- {2} -- {3} -- {4}</div>'.format(self.body, 'Name', 'Pre', 'Now', 'Increase')
        self.isReadyToSend = True
        for data in self.data[0]:
            if data == 'time':
                continue
            dataIndex = self.data[0].index(data)
            if data in self.preHeader:
                self.body = '{0}<div>{1} -- {2} -- {3} -- {4}</div>'.format(self.body, data, self.preData[self.preHeader.index(self.data[0][dataIndex])], self.data[1][dataIndex], self.static[dataIndex])
            else:
                self.body = '{0}<div>{1} -- {2} -- {3} -- {4}</div>'.format(self.body, data, 0, self.data[1][dataIndex], self.static[dataIndex])

    def collectStatisticData(self):
        files = os.listdir(self.base_path)
        files.remove('log')
        isChronusExits = os.path.exists(self.table_path)
        if isChronusExits is True:
            previousData = self.file.readFromCSV(self.table_path)
        else:
            previousData = [['time']]
            previousData[0].extend(files)
            previousData.append([self.YearMonthDay])
            previousData[1].extend([0 for i in range(len(files))])
        self.data = [['time'], [str(self.YearMonthDay)]]
        for file in files:
            file_path = "{0}/{1}/{2}_content.csv".format(self.base_path, file, file)
            content = self.file.readFromCSV(file_path)
            self.data[0].append(file)
            self.data[1].append(str(len(content) - 1))
            print file + ' ' + str(len(content) - 1)
        newHeader = self.data[0]
        self.preHeader = previousData[len(previousData) - 2]
        self.preData = previousData[len(previousData) - 1]
        self.static = [0]
        self.increase_sum = 0
        self.total = 0
        for header in newHeader:
            if header == 'time':
                continue
            self.total = self.total + int(self.data[1][self.data[0].index(header)])
            if header in self.preHeader:
                increase = int(self.data[1][self.data[0].index(header)]) - int(self.preData[self.preHeader.index(header)])
            else:
                increase = int(self.data[1][self.data[0].index(header)])
            self.increase_sum = self.increase_sum + increase
            self.static.append(str(increase))

    def getHourMinute(self):
        self.hourMinute = time.strftime('%H:%M', time.localtime(time.time()))

    def getDate(self):
        self.YearMonthDay = time.strftime('%Y-%m-%d ', time.localtime(time.time()))

    def updateTable(self):
        self.file.writeToCSVWithoutHeader(self.table_path, self.data[0])
        self.file.writeToCSVWithoutHeader(self.table_path, self.data[1])

    def sendEmail(self):
        self.createEmailBody()
        if self.isReadyToSend is True:
            host = 'smtp.163.com'
            port = 465
            sender = '*****@*****.**'
            pwd = 'thebest1990'
            receiver = '*****@*****.**'
            msg = MIMEText(self.body, 'html', 'utf-8')
            msg['subject'] = 'pr4 chronus'
            msg['from'] = sender
            msg['to'] = receiver
            msg["Accept-Language"] = 'zh-CN'
            msg["Accept-Charset"] = 'ISO-8859-1,utf-8'
            try:
                s = smtplib.SMTP_SSL(host, port)
                s.login(sender, pwd)
                s.sendmail(sender, receiver, msg.as_string())
                print('Done! Sent email success')
            except smtplib.SMTPException:
                print('Error! Sent email fail')

    def report(self):
        self.collectStatisticData()
        self.sendEmail()

    def run_chronus(self):
        self.init()
        print 'Now time is {0}'.format(self.hourMinute)
        self.report()
        print 'End report'
        if self.hourMinute == "20:00":
            print 'Start to store table'
            self.updateTable()
            print 'End to update table'
        del self
        gc.collect()