# -*- coding: utf-8 -*-
# Python 2 module. Standard-library and third-party imports used below:
import os
import re
import time
import shutil
import hashlib
import tarfile
import urllib
import urlparse
from datetime import datetime, timedelta

import redis
import paramiko
import requests
import numpy as np
from PIL import Image
from urllib3 import encode_multipart_formdata

# Project-internal imports. The module paths here are assumptions based on
# the names used in this file; adjust them to the actual project layout.
from settings import Settings
from file_io_middleware import FileIOMiddleware
from mongo_middleware import MongoMiddleware
from bloom_filter import BloomFilter
from dtos import (camelDto, spiderDto, siteInfoDto, regxMatchDto,
                  singleSiteDto, allSitesDto, totalDto)


class Doraemon(object):

    def __init__(self):
        settings = Settings()
        settings.CreateCommonSettings()
        self.file = FileIOMiddleware()

        # Redis connection and the Bloom filters used for de-duplication.
        self.rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT)
        self.bf_urls = BloomFilter(self.rconn, settings.BLOOMFILTER_URLS)
        self.bf_content = BloomFilter(self.rconn, settings.BLOOMFILTER_CONTENT)
        self.bf_authors = BloomFilter(self.rconn, settings.BLOOMFILTER_AUTHORS)
        self.disable_restart_interval = settings.DISABLE_RESTART_INTERVAL
        self.bf_weixin_url = BloomFilter(self.rconn,
                                         settings.FINISHED_WEIXIN_URL_ARTICLE)
        self.bf_weixin_content = BloomFilter(
            self.rconn, settings.FINISHED_WEIXIN_CONTENT_ARTICLE)
        self.bf_weixin_id = BloomFilter(self.rconn,
                                        settings.FINISHED_WEIXIN_URL_ID)
        self.bf_finished_image_id = BloomFilter(self.rconn,
                                                settings.FINISHED_IMAGE_ID)
        self.bf_finished_temp_weixin = BloomFilter(
            self.rconn, settings.FINISHED_TEMP_WEIXIN)
        self.md5 = hashlib.md5()

        # Concurrency throttling for the camel (download) side.
        self.max_concurrency = settings.MAX_CONCURRENCY
        self.concurrency_file = settings.CONCURRENCY_FILE
        self.concurrency_refresh_file = settings.CONCURRENCY_REFRESH_FILE
        self.refresh_concurrency_interval = settings.REFRESH_CONCURRENCY_INTERVAL

        # Concurrency throttling for the spider side.
        self.max_concurrency_spider = settings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = settings.CONCURRENCY_FILE_SPIDER
        self.concurrency_refresh_file_spider = settings.CONCURRENCY_REFRESH_FILE_SPIDER
        self.refresh_concurrency_interval_spider = settings.REFRESH_CONCURRENCY_INTERVAL_SPIDER

        self.bf_huxiu_nlp = BloomFilter(self.rconn, settings.FINISHED_HUXIU_NLP)
        self.sites_info = settings.SITES_INFO
        self.sites_debug = settings.SITES_DEBUG

    def sshUpload(self, address, port, username, password, fromFile, toFile):
        transport = paramiko.Transport((address, port))
        try:
            print 'Start to upload file: {0}'.format(fromFile)
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            sftp.put(fromFile, toFile)
            transport.close()
            print 'Finished uploading file: {0}'.format(fromFile)
            return True
        except Exception as e:
            print 'Exception {0} while uploading file: {1}'.format(
                e.message, fromFile)
            return False

    def moveFile(self, fromfile=None, tofile=None):
        if fromfile is None or os.path.exists(fromfile) is False:
            print "Source file {0} does not exist".format(fromfile)
            return False
        try:
            # Retry once per second until the target file shows up,
            # up to retryLimit attempts.
            retry = 1
            retryLimit = 60
            while not os.path.exists(tofile) and retry <= retryLimit:
                if retry > 1:
                    time.sleep(1)
                shutil.move(fromfile, tofile)
                retry += 1
            if retryLimit < retry:
                raise Exception('Move file retry limit reached.')
            return True
        except Exception as e:
            raise Exception(
                "Exception {0} while moving file {1} to {2}.".format(
                    e.message, fromfile, tofile))

    def copyFile(self, fromfile=None, tofile=None):
        if fromfile is None or os.path.exists(fromfile) is False:
            print "Source file {0} does not exist".format(fromfile)
            return False
        try:
            retry = 1
            retryLimit = 60
            while not os.path.exists(tofile) and retry <= retryLimit:
                if retry > 1:
                    time.sleep(1)
                shutil.copy(fromfile, tofile)
                retry += 1
            if retryLimit < retry:
                raise Exception('Copy file retry limit reached.')
            return True
        except Exception as e:
            raise Exception(
                "Exception {0} while copying file {1} to {2}.".format(
                    e.message, fromfile, tofile))

    def createFilePath(self, path):
        if os.path.exists(path) is False:
            os.makedirs(path)

    def isExceedRestartInterval(self, path, restart_interval):
        if os.path.exists(path) is False:
            print 'restart file does not exist; creating a new one'
            self.file.writeToTxtCover(path, time.time())
            return True
        past = float(self.file.readFromTxt(path))
        now = time.time()
        isExceed = ((now - past) // 60 >= restart_interval) or (
            self.disable_restart_interval is True)
        if isExceed is True:
            print 'exceeds the restart interval and restarts'
            self.file.writeToTxtCover(path, time.time())
        else:
            print 'does not exceed the restart interval and stops'
        return isExceed

    def isExceedTimeoutInterval(self, timeout, past):
        if self.isEmpty(timeout):
            print 'timeout is empty'
            return False
        now = time.time()
        isExceed = ((now - past) // 60 >= timeout)
        if isExceed is True:
            print 'exceeds the timeout.'
        else:
            print 'does not exceed the timeout.'
        return isExceed

    def isEmpty(self, obj=None):
        if isinstance(obj, unicode):
            obj = obj.encode('utf-8')
        if isinstance(obj, str):
            # Empty if the string contains nothing but whitespace.
            return len([obj for i in obj if i.strip()]) == 0
        elif isinstance(obj, (int, float)):
            return False
        elif isinstance(obj, (list, dict, tuple, set)):
            return len(obj) == 0
        else:
            return obj is None

    def isNumber(self, item):
        return isinstance(item, (int, float))

    def isTitleEmpty(self, title, url):
        if self.isEmpty(title):
            print 'Empty title for: {0}'.format(url)
            return True
        return False

    def isUrlValid(self, url, good_keys, bad_keys, regx, valid):
        # The url must match at least one regex; good keys can then turn
        # validity on, and bad keys can veto it again.
        is_match = False
        for regx_item in regx:
            if regx_item.match(url) is not None:
                is_match = True
        if is_match is False:
            print 'Invalid url, no regex match: {0}'.format(url)
            return False
        for good in good_keys:
            if valid is True:
                continue
            if good in url:
                print 'Match good key: {0}'.format(good)
                valid = True
        for bad in bad_keys:
            if valid is False:
                continue
            if bad in url:
                print 'Match bad key: {0}'.format(bad)
                valid = False
        return valid
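    # Example (illustrative values, not from the project config): with
    #   regx      = [re.compile(r'https://example\.com/\d+')]
    #   good_keys = ['news'], bad_keys = ['video'], valid = False,
    # the url 'https://example.com/123/news' is accepted (the regex matches
    # and a good key flips valid to True), while
    # 'https://example.com/123/news/video' is vetoed afterwards by the bad key.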
    def getImageTypeFromUrl(self, url):
        # Test each substring explicitly ('jpeg' or '.jpg' in url would
        # always be truthy).
        if 'jpeg' in url or '.jpg' in url:
            return 'jpg'
        if 'png' in url:
            return 'png'
        if 'gif' in url:
            return 'gif'
        print 'Other image type; using default type'
        return 'png'

    def isDuplicated(self, filter, content):
        content_encode = str(content).encode("utf-8")
        if filter.isContains(content_encode):
            print 'Content {0} duplicated!'.format(content)
            return True
        else:
            filter.insert(content_encode)
            print 'Content {0} not duplicated!'.format(content)
            return False

    def isFinished(self, filter, content):
        content_encode = str(content).encode("utf-8")
        if filter.isContains(content_encode):
            print 'Content {0} exists!'.format(content)
            return True
        else:
            return False

    def storeFinished(self, filter, content):
        print 'Start to store content: {0}'.format(content)
        content_encode = str(content).encode("utf-8")
        filter.insert(content_encode)

    def storeMongodb(self, mongo_url, data):
        mongo = MongoMiddleware()
        mongo.insert(mongo_url, data)

    def storeTxt(self, id, content, finished_txt_path, name):
        try:
            self.createFilePath(finished_txt_path)
            print 'Start to store txt: {0}'.format(id)
            self.file.writeToTxtCover(
                '{0}/{1}_{2}.txt'.format(finished_txt_path, name, id),
                content)
            print 'End to store txt: {0}'.format(id)
        except Exception as e:
            print 'Exception {0} while storing txt: {1}'.format(e.message, id)

    def storeTxtAdd(self, author_txt_path, author_name, settingName):
        try:
            self.createFilePath(author_txt_path)
            print 'Start to store txt: {0}'.format(author_name)
            self.file.writeToTxtAdd(
                '{0}/{1}_authors.txt'.format(author_txt_path, settingName),
                author_name)
        except Exception as e:
            print 'Exception while storing txt: {0}, for {1}'.format(
                author_name, e.strerror)
        print 'End to store txt: {0}'.format(author_name)

    def storeHtml(self, id, content, finished_html_path):
        try:
            self.createFilePath(finished_html_path)
            print 'Start to store html: {0}'.format(id)
            self.file.writeToHtmlCover(
                '{0}/{1}.html'.format(finished_html_path, id), content)
            print 'End to store html: {0}'.format(id)
            return True
        except Exception as e:
            print 'Exception {0} while storing html: {1}'.format(
                e.message, id)
            return False

    def filter(self, filter, url_titles):
        new_url_titles = []
        for url_title in url_titles:
            if self.isFinished(filter, url_title[1]) is False:
                new_url_titles.append(url_title)
        return new_url_titles

    def imageFilter(self, filter, ids):
        new_ids = []
        for id in ids:
            if self.isFinished(filter, id) is False:
                new_ids.append(id)
        return new_ids

    def readNewUrls(self, filter, url_path):
        print 'Start to read urls'
        new_url_titles = []
        if os.path.exists(url_path) is True:
            url_titles = np.array(
                self.file.readColsFromCSV(url_path, ['url', 'title']))
            new_url_titles = self.filter(filter, url_titles)
        return new_url_titles

    def readNewImageIds(self, filter, content_path):
        print 'Start to read ids'
        new_ids = []
        id_list = []
        if os.path.exists(content_path) is True:
            ids = np.array(self.file.readColsFromCSV(content_path, ['id']))
            for id in ids:
                id_list.append(id[0])
            new_ids = self.imageFilter(filter, id_list)
        return new_ids

    def downloadImage(self, image_url, store_path, image_name):
        try:
            self.createFilePath(store_path)
            print 'start to download image: {0}'.format(image_url)
            urllib.urlretrieve(image_url,
                               '{0}/{1}'.format(store_path, image_name))
            return True
        except Exception as e:
            print 'exception while downloading image: {0} for {1}'.format(
                image_url, e.message)
            return False

    # Thin wrappers around the Redis hash commands.
    def hashSet(self, name, key, value):
        self.rconn.hset(name, key, value)

    def getHashSet(self, name, key):
        return self.rconn.hget(name, key)

    def getAllHasSet(self, name):
        return self.rconn.hgetall(name)

    def delHashSet(self, name, key):
        return self.rconn.hdel(name, key)

    def delKey(self, key):
        return self.rconn.delete(key)

    def getKeyLen(self, key):
        return self.rconn.hlen(key)

    def getDateOfDaysBefore(self, days):
        return (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")

    def getCurrentYear(self):
        return time.strftime('%Y', time.localtime(time.time()))

    def getCurrentDate(self):
        return time.strftime('%Y-%m-%d', time.localtime(time.time()))

    def getCurrentLocalTime(self):
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def getDateTime(self, string, dateFormat, pattern, isMatchDate):
        # Search for dateFormat in the string and parse the match with
        # strptime. Time-only patterns (isMatchDate=False) also return the
        # parsed date; callers only test the result for None.
        try:
            match = re.search(dateFormat, string)
            strp = datetime.strptime(match.group(), pattern)
            print "Match date success: {0}".format(strp.date())
            return strp.date()
        except:
            print "Match date fail"
            return None

    def getDateFromChinese(self, string):
        # Map relative Chinese (and English) date phrases such as 今天
        # (today), 昨天 (yesterday), 前天 (the day before yesterday) and
        # N天前 (N days ago) to concrete dates.
        year = self.getCurrentYear()
        try:
            if "今天" in string or \
               "秒前" in string or \
               "分钟前" in string or \
               "小时前" in string or \
               "Today" in string:
                return self.getDateOfDaysBefore(0)
            if "昨天" in string or \
               "1天前" in string or \
               "Yesterday" in string:
                return self.getDateOfDaysBefore(1)
            if "前天" in string or \
               "2天前" in string or \
               "2 days ago" in string:
                return self.getDateOfDaysBefore(2)
            if "3天前" in string or "3 days ago" in string:
                return self.getDateOfDaysBefore(3)
            if "4天前" in string or "4 days ago" in string:
                return self.getDateOfDaysBefore(4)
            if "5天前" in string or "5 days ago" in string:
                return self.getDateOfDaysBefore(5)
            if "6天前" in string or "6 days ago" in string:
                return self.getDateOfDaysBefore(6)
            if "1周前" in string or "1 week ago" in string:
                return self.getDateOfDaysBefore(7)
            # Absolute dates written with 月/日 (month/day), with or
            # without an explicit 年 (year).
            if "年" not in string and "月" in string and "日" in string:
                data = re.split(",",
                                string.replace('月', ',').replace('日', ''))
                return "{0}-{1}-{2}".format(year,
                                            self.getNumberFromString(data[0]),
                                            self.getNumberFromString(data[1]))
            if "年" in string and "月" in string and "日" in string:
                data = re.split(
                    ",",
                    string.replace('年', ',').replace('月', ',').replace(
                        '日', ','))
                return "{0}-{1}-{2}".format(self.getNumberFromString(data[0]),
                                            self.getNumberFromString(data[1]),
                                            self.getNumberFromString(data[2]))
        except:
            print "Failed to match date from Chinese."
        return None

    def getNumberFromString(self, string):
        return ''.join(re.findall(r'\d+', string)).strip()

    def getFinalDate(self, year, month, day):
        return "{0}-{1}-{2}".format(year, month, day)

    def formateMonthDay(self, MD):
        return '{:02d}'.format(MD)

    def getDateFromString(self, string_date):
        # Normalise a free-form date string to YYYY-MM-DD. Relative Chinese
        # phrases are resolved first, then progressively narrower absolute
        # patterns (dashed, dotted, slashed, with or without a year); a
        # time-only string is taken to mean today. Literal dots in the
        # dotted patterns are escaped so they do not match arbitrary
        # separators.
        _date_chinese = self.getDateFromChinese(string_date)
        if _date_chinese is not None:
            string_date = _date_chinese
        _date_year_month_day_crossing = self.getDateTime(
            string_date, r'\d{4}-\d{1,2}-\d{1,2}', '%Y-%m-%d', True)
        _date_year2_month_day_crossing = self.getDateTime(
            string_date, r'\d{2}-\d{1,2}-\d{1,2}', '%y-%m-%d', True)
        _date_month_day_crossing = self.getDateTime(
            string_date, r'\d{1,2}-\d{1,2}', '%m-%d', True)
        _date_year_month_day_dot = self.getDateTime(
            string_date, r'\d{4}\.\d{1,2}\.\d{1,2}', '%Y.%m.%d', True)
        _date_month_day_dot = self.getDateTime(
            string_date, r'\d{1,2}\.\d{1,2}', '%m.%d', True)
        _date_year_month_day_slash = self.getDateTime(
            string_date, r'\d{4}\/\d{1,2}\/\d{1,2}', '%Y/%m/%d', True)
        _date_month_day_slash = self.getDateTime(
            string_date, r'\d{1,2}\/\d{1,2}', '%m/%d', True)
        _time_hour_minute_second = self.getDateTime(
            string_date, r'\d{1,2}:\d{1,2}:\d{1,2}', '%H:%M:%S', False)
        _time_hour_minute = self.getDateTime(string_date,
                                             r'\d{1,2}:\d{1,2}', '%H:%M',
                                             False)
        year = self.getCurrentYear()
        if _date_year_month_day_crossing is not None:
            return self.getFinalDate(
                _date_year_month_day_crossing.year,
                self.formateMonthDay(_date_year_month_day_crossing.month),
                self.formateMonthDay(_date_year_month_day_crossing.day))
        if _date_year2_month_day_crossing is not None:
            return self.getFinalDate(
                _date_year2_month_day_crossing.year,
                self.formateMonthDay(_date_year2_month_day_crossing.month),
                self.formateMonthDay(_date_year2_month_day_crossing.day))
        if _date_year_month_day_crossing is None and _date_month_day_crossing is not None:
            return self.getFinalDate(
                year, self.formateMonthDay(_date_month_day_crossing.month),
                self.formateMonthDay(_date_month_day_crossing.day))
        if _date_year_month_day_dot is not None:
            return self.getFinalDate(
                _date_year_month_day_dot.year,
                self.formateMonthDay(_date_year_month_day_dot.month),
                self.formateMonthDay(_date_year_month_day_dot.day))
        if _date_year_month_day_dot is None and _date_month_day_dot is not None:
            return self.getFinalDate(
                year, self.formateMonthDay(_date_month_day_dot.month),
                self.formateMonthDay(_date_month_day_dot.day))
        if _date_year_month_day_slash is not None:
            return self.getFinalDate(
                _date_year_month_day_slash.year,
                self.formateMonthDay(_date_year_month_day_slash.month),
                self.formateMonthDay(_date_year_month_day_slash.day))
        if _date_year_month_day_slash is None and _date_month_day_slash is not None:
            return self.getFinalDate(
                year, self.formateMonthDay(_date_month_day_slash.month),
                self.formateMonthDay(_date_month_day_slash.day))
        if _time_hour_minute_second is not None or _time_hour_minute is not None:
            return self.getCurrentDate()
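    # Worked examples for getDateFromString (assuming today is 2019-06-15):
    #   '2019-06-01' -> '2019-06-01'
    #   '19-6-1'     -> '2019-06-01'  (two-digit year form)
    #   '6月1日'     -> '2019-06-01'  (resolved via getDateFromChinese)
    #   '昨天 12:30' -> '2019-06-14'
    #   '12:30'      -> '2019-06-15'  (time only means today)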
    def getMD5(self, content):
        # Use a fresh digest per call; reusing one md5 object would hash
        # the concatenation of everything passed in so far.
        md5 = hashlib.md5()
        md5.update(content.encode('utf-8'))
        return md5.hexdigest()

    def compressImage(self, origin_image_path, destination_image_path,
                      multiplier):
        try:
            sImg = Image.open(origin_image_path)
            w, h = sImg.size
            dImg = sImg.resize((int(w / multiplier), int(h / multiplier)),
                               Image.ANTIALIAS)
            os.remove(origin_image_path)
            dImg.save(destination_image_path)
            print "Compress picture {0} success!".format(
                destination_image_path)
        except Exception as e:
            print "Compress picture {0} failed for {1}".format(
                destination_image_path, e.message)

    def getSizeOfImage(self, image_path):
        try:
            img = Image.open(image_path)
            return img.size
        except Exception as e:
            print "Exception while opening picture {0}: {1}.".format(
                image_path, e.message)

    def getFileSize(self, file_path):
        # File size in KB, rounded to two decimals.
        try:
            fsize = os.path.getsize(file_path)
            fsize = fsize / float(1024)
            return round(fsize, 2)
        except Exception as e:
            print "Exception while getting file size of {0}: {1}.".format(
                file_path, e.message)

    def getFileList(self, directory):
        file_list = []
        if os.path.exists(directory) is True:
            file_list = os.listdir(directory)
        return file_list

    def isFileExists(self, file_path):
        return os.path.exists(file_path)

    def deleteFile(self, file_path):
        try:
            print "Start to delete file: {0}".format(file_path)
            os.remove(file_path)
            print "Finished deleting file: {0}".format(file_path)
        except Exception as e:
            print "Exception while deleting file {0}: {1}".format(
                file_path, e.message)

    def tar(self, directory):
        file_list = os.listdir(directory)
        if len(file_list) == 0:
            print "There is no file to compress for: {0}".format(directory)
            return
        try:
            print "Start to compress directory: {0}".format(directory)
            t = tarfile.open(directory + ".tar.gz", "w:gz")
            for root, dirs, files in os.walk(directory):
                for file in files:
                    fullpath = os.path.join(root, file)
                    t.add(fullpath)
            t.close()
            print "Finished compressing directory: {0}".format(directory)
        except Exception as e:
            print "Exception while compressing directory {0}: {1}".format(
                directory, e.message)

    def tarList(self, directory):
        # Same as tar(), but also returns the list of archived files.
        file_list = os.listdir(directory)
        if len(file_list) == 0:
            print "There is no file to compress for: {0}".format(directory)
            return
        try:
            print "Start to compress directory: {0}".format(directory)
            lst = []
            t = tarfile.open(directory + ".tar.gz", "w:gz")
            for root, dirs, files in os.walk(directory):
                for file in files:
                    fullpath = os.path.join(root, file)
                    t.add(fullpath)
                    lst.append(fullpath)
            t.close()
            print "Finished compressing directory: {0}".format(directory)
            return lst
        except Exception as e:
            print "Exception while compressing directory {0}: {1}".format(
                directory, e.message)
            return []

    def isCamelReadyToRun(self, settings):
        if self.isWorkTime(settings.START_TIME, settings.END_TIME) is False:
            return False
        if self.isConcurrencyAllowToRun(self.concurrency_refresh_file,
                                        self.refresh_concurrency_interval,
                                        self.concurrency_file,
                                        self.max_concurrency) is False:
            return False
        if self.isExceedRestartInterval(settings.RESTART_PATH,
                                        settings.RESTART_INTERVAL) is False:
            self.recoveryConcurrency(self.concurrency_file,
                                     self.max_concurrency)
            return False
        return True

    def isSpiderReadyToRun(self):
        return self.isConcurrencyAllowToRun(
            self.concurrency_refresh_file_spider,
            self.refresh_concurrency_interval_spider,
            self.concurrency_file_spider, self.max_concurrency_spider)

    def isWorkTime(self, start_time, end_time):
        if self.isNumber(start_time) is False:
            print 'start time is empty'
            return False
        if self.isNumber(end_time) is False:
            print 'end time is empty'
            return False
        if self.isAfterHour(start_time) and self.isBeforeHour(end_time):
            print 'it is work time'
            return True
        else:
            print 'it is not work time before {0} or after {1}'.format(
                start_time, end_time)
            return False
    def isAfterHour(self, hour):
        if self.isNumber(hour) is False:
            print 'input hour is empty.'
            return
        current_time = time.strftime('%H', time.localtime(time.time()))
        return int(hour) < int(current_time)

    def isBeforeHour(self, hour):
        if self.isNumber(hour) is False:
            print 'input hour is empty.'
            return
        current_time = time.strftime('%H', time.localtime(time.time()))
        return int(hour) >= int(current_time)

    def readFile(self, file):
        # Block until the file has non-empty content; another process may
        # be rewriting it.
        waiting = 0
        data = self.file.readFromTxt(file).strip()
        while self.isEmpty(data):
            print 'file {0} is under update, waiting... {1} s'.format(
                file, waiting)
            time.sleep(1)
            waiting += 1
            data = self.file.readFromTxt(file).strip()
        return data

    def isConcurrencyAllowToRun(self, concurrency_refresh_file,
                                refresh_concurrency_interval,
                                concurrency_file, max_concurrency):
        # A file-based counting semaphore: take one slot if any is free.
        self.updateConcurrencyFile(concurrency_refresh_file,
                                   refresh_concurrency_interval,
                                   concurrency_file, max_concurrency)
        if os.path.exists(concurrency_file) is False:
            print 'concurrency file does not exist; creating a new one with max concurrency: {0}'.format(
                str(max_concurrency))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
        concurrency_available = int(self.readFile(concurrency_file))
        print 'concurrency file exists: {0}'.format(
            str(concurrency_available))
        if concurrency_available > 0:
            print 'app is able to run.'
            new_concurrency_available = concurrency_available - 1
            print 'new concurrency is: {0}'.format(
                str(new_concurrency_available))
            self.file.writeToTxtCover(concurrency_file,
                                      str(new_concurrency_available))
            return True
        else:
            print 'app is not able to run: no available concurrency.'
            return False

    def recoveryConcurrency(self, concurrency_file, max_concurrency):
        # Release one semaphore slot, clamping at max_concurrency.
        if os.path.exists(concurrency_file) is False:
            print 'concurrency file does not exist; creating a new one with max concurrency: {0}'.format(
                str(max_concurrency))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
            return
        concurrency_available = int(self.readFile(concurrency_file))
        print 'concurrency file exists; start to recover: {0}'.format(
            str(concurrency_available))
        if concurrency_available < max_concurrency:
            print 'start to recover concurrency.'
            new_concurrency_available = concurrency_available + 1
            print 'new concurrency is: {0}'.format(
                str(new_concurrency_available))
            self.file.writeToTxtCover(concurrency_file,
                                      str(new_concurrency_available))
        else:
            print 'concurrency is abnormal; writing max concurrency to it.'
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
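    # Example: with max_concurrency = 3 the counter file goes 3 -> 2 -> 1 -> 0
    # as successive workers call isConcurrencyAllowToRun(); a fourth caller is
    # refused until recoveryConcurrency() releases a slot or the periodic
    # refresh in updateConcurrencyFile() (below) resets the file to 3.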
    def updateConcurrencyFile(self, concurrency_refresh_file,
                              refresh_concurrency_interval, concurrency_file,
                              max_concurrency):
        # Periodically reset the semaphore to its maximum so that leaked
        # slots cannot starve the system forever.
        if self.isExceedRestartInterval(concurrency_refresh_file,
                                        refresh_concurrency_interval) is True:
            print 'refresh concurrency file: {0}'.format(str(max_concurrency))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))

    def createCamelData(self, title, url, id, download_time, source):
        return camelDto(title, url, id, download_time, source)

    def createCamelMongoJson(self, camelDto):
        return {
            'title': camelDto.title,
            'url': camelDto.url,
            'id': camelDto.id,
            'download_time': camelDto.download_time,
            'source': camelDto.source
        }

    def createSpiderData(self, url, origin_url, public_time, author_name,
                         title, id, download_time, source, images,
                         is_open_cache, content):
        return spiderDto(url, origin_url, public_time, author_name, title,
                         id, download_time, source, images, is_open_cache,
                         content)

    def createSpiderMongoJson(self, spiderDto):
        return {
            'url': spiderDto.url,
            'origin_url': spiderDto.origin_url,
            'public_time': spiderDto.public_time,
            'author_name': spiderDto.author_name,
            'title': spiderDto.title,
            'id': spiderDto.id,
            'download_time': spiderDto.download_time,
            'source': spiderDto.source,
            'images': spiderDto.images,
            'is_open_cache': spiderDto.is_open_cache
        }

    def updateImages(self, images, newImages):
        for image in newImages:
            data = image.strip()
            if self.isEmpty(data) is False and data not in images:
                images.append(data)

    def completeImageUrls(self, newImages, current_url):
        # Resolve relative image urls against the current page and keep
        # only absolute https urls.
        result = []
        if len(newImages) == 0:
            print 'No image urls to process'
            return result
        for url in newImages:
            entireUrl = urlparse.urljoin(current_url, url).strip()
            if re.match('https', entireUrl) is not None:
                result.append(entireUrl)
        return result

    def getSitesInfo(self, isdebug=False):
        # Parse the site configuration file: sections are separated by a
        # [SITE] marker and each line is a 'KEY == value' pair.
        if isdebug:
            site_info_path = self.sites_debug
        else:
            site_info_path = self.sites_info
        content = self.file.readFromTxt(site_info_path)
        if self.isEmpty(content):
            print 'sites info is empty'
            return None
        sitesInfo = content.split('[SITE]')
        results = []
        for site in sitesInfo:
            if self.isEmpty(site):
                continue
            results.append(self.extractSiteInfo(site))
        return results

    def extractSiteInfo(self, siteInfo):
        items = siteInfo.split('\n')
        result = siteInfoDto(domain=None,
                             name=None,
                             restart_interval=None,
                             url_parallel_number=None,
                             content_parallel_number=None,
                             is_open_cache=None,
                             work_time_start=None,
                             work_time_end=None,
                             good_keys=[],
                             bad_keys=[],
                             href_items=[],
                             href=[],
                             url_match=[],
                             url_title_match=[],
                             url_id_tag=[],
                             content_match=[],
                             content_child_match=[],
                             content_url_match=[],
                             content_id_tag=[],
                             article_match=[],
                             content_title_match=[],
                             content_image_match=[],
                             content_time_match=[],
                             need_self_image=None,
                             url_timeout=None,
                             content_timeout=None)
        for item in items:
            if self.isEmpty(item):
                continue
            content = item.split('==')
            key = ''.join(content[0]).strip()
            value = ''.join(content[1]).strip()
            if key == 'DOMAIN':
                result.domain = value
                continue
            if key == 'NAME':
                result.name = value
                continue
            if key == 'RESTARTINTERVAL':
                result.restart_interval = int(value)
                continue
            if key == 'URLPARALLELNUMBER':
                result.url_parallel_number = int(value)
                continue
            if key == 'CONTENTPARALLELNUMBER':
                result.content_parallel_number = int(value)
                continue
            if key == 'ISOPENCACHE':
                # bool(value) would be True for any non-empty string, so
                # compare against the literal, as the NEEDSELF* keys do.
                result.is_open_cache = value == 'True'
                continue
            if key == 'WORKTIMESTART':
                result.work_time_start = int(value)
                continue
            if key == 'WORKTIMEEND':
                result.work_time_end = int(value)
                continue
            if key == 'GOODKEYS':
                if self.isEmpty(value) is False:
                    result.good_keys.append(value)
                continue
            if key == 'BADKEYS':
                if self.isEmpty(value) is False:
                    result.bad_keys.append(value)
                continue
            if key == 'URLMATCH':
                if self.isEmpty(value) is False:
                    result.url_match.append(self.extractRegxRule(value))
                continue
            if key == 'URLTITLEMATCH':
                if self.isEmpty(value) is False:
                    result.url_title_match.append(self.extractHtmlTag(value))
                continue
            if key == 'URLIDTAG':
                if self.isEmpty(value) is False:
                    result.url_id_tag.append(self.extractHtmlTag(value))
                continue
            if key == 'CONTENTURLMATCH':
                if self.isEmpty(value) is False:
                    result.content_url_match.append(
                        self.extractRegxRule(value))
                continue
            if key == 'CONTENTIDTAG':
                if self.isEmpty(value) is False:
                    result.content_id_tag.append(self.extractHtmlTag(value))
                continue
            if key == 'HREFITEMS':
                if self.isEmpty(value) is False:
                    result.href_items.append(self.extractHtmlTag(value))
                continue
            if key == 'HREF':
                if self.isEmpty(value) is False:
                    result.href.append(self.extractHtmlTag(value))
                continue
            if key == 'ARTICLEMATCH':
                if self.isEmpty(value) is False:
                    result.article_match.append(self.extractHtmlTag(value))
                continue
            if key == 'CONTENTMATCH':
                if self.isEmpty(value) is False:
                    result.content_match.append(self.extractHtmlTag(value))
                continue
            if key == 'CONTENTCHILDMATCH':
                if self.isEmpty(value) is False:
                    result.content_child_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'CONTENTTITLEMATCH':
                if self.isEmpty(value) is False:
                    result.content_title_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'CONTENTIMAGEMATCH':
                if self.isEmpty(value) is False:
                    result.content_image_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'CONTENTTIMEMATCH':
                if self.isEmpty(value) is False:
                    result.content_time_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'NEEDSELFIMAGE':
                if self.isEmpty(value) is False:
                    result.need_self_image = value == 'True'
            if key == 'NEEDSELFHTML':
                if self.isEmpty(value) is False:
                    result.need_self_html = value == 'True'
            if key == 'URLTIMEOUT':
                if self.isEmpty(value) is False:
                    result.url_timeout = value
            if key == 'CONTENTTIMEOUT':
                if self.isEmpty(value) is False:
                    result.content_timeout = value
        return result

    def getUrlId(self, url, idTag):
        # For each tag rule, locate its regx fragment in the url and return
        # the single character item.index positions past the match start.
        id = None
        for item in idTag:
            matchItem = item.regx
            if matchItem in url:
                index = url.index(item.regx) + item.index
                if len(url) <= index:
                    continue
                id = url[index]
            if id is None:
                continue
            return id

    def extractRegxRule(self, regxMatch):
        return re.compile(regxMatch)

    def extractHtmlTag(self, regxMatch):
        # Config values look like 'tag|index'.
        items = regxMatch.split('|')
        id = ''.join(items[0]).strip()
        index = int(items[1])
        return regxMatchDto(id, index)

    def getMatchContent(self, content, regx):
        if regx.index == -1 or len(content) == 0:
            return content
        return content[regx.index]

    def uploadFileApi(self, url, fileName, fullPath):
        try:
            with open(fullPath, mode="r") as f:
                file = {"file": (fileName, f.read())}
                encode_data = encode_multipart_formdata(file)
                file_data = encode_data[0]
                headers_from_data = {"Content-Type": encode_data[1]}
                response = requests.post(url=url,
                                         headers=headers_from_data,
                                         data=file_data).json()
                if response['code'] != 200:
                    print 'Failed to upload file {0} through api {1}'.format(
                        fileName, url)
                    return False
                print 'Succeeded to upload file {0} through api {1}'.format(
                    fileName, url)
                return True
        except Exception as e:
            print 'Exception while uploading file {0} through api {1}: {2}'.format(
                fileName, url, e.message)
            return False
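
# Illustrative sketch of the config format that getSitesInfo() and
# extractSiteInfo() expect: sites are separated by a '[SITE]' marker and each
# line is a 'KEY == value' pair. The sample below is made up; the keys mirror
# the ones parsed above, but the domain, regex and tag values are assumptions.
SAMPLE_SITES_INFO = """
[SITE]
DOMAIN == example.com
NAME == example
RESTARTINTERVAL == 30
ISOPENCACHE == True
URLMATCH == https://example\\.com/article/\\d+
URLIDTAG == article/|8
GOODKEYS == news
BADKEYS == video
"""


def _demoParseSitesInfo(raw):
    # Standalone version of the key/value split used by extractSiteInfo(),
    # runnable without Redis or the Settings machinery.
    for section in raw.split('[SITE]'):
        for line in section.split('\n'):
            if '==' not in line:
                continue
            key, value = [part.strip() for part in line.split('==', 1)]
            print '{0} -> {1}'.format(key, value)
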
class UpdateMonitorFiles(object):

    def __init__(self, siteinfo=None):
        self.siteinfo = siteinfo
        self.globalSettings = Settings()
        self.doraemon = Doraemon()
        self.getSettings()
        self.file = FileIOMiddleware()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.work_path_prd4 = self.settings.WORK_PATH_PRD1
        self.work_path_prd3 = self.settings.WORK_PATH_PRD2
        self.content_backup_path = self.settings.FINISHED_BACKUP_PATH
        self.content_backup_post_path = self.settings.FINISHED_BACKUP_POST_PATH
        self.url_backup_path = self.settings.URL_BACKUP_PATH
        self.url_backup_post_path = self.settings.URL_BACKUP_POST_PATH
        self.monitor_site_template_path = self.globalSettings.MONITOR_SITE_TEMPLATE_PATH
        self.monitor_spiders_template_path = self.globalSettings.MONITOR_SPIDERS_TEMPLATE_PATH
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.monitor_site_webserver0 = self.globalSettings.MONITOR_SITE_HTML_WEBSERVER0
        self.monitor_site_url = self.globalSettings.MONITOR_SITE_URL
        self.monitor_upload_webserver0 = self.globalSettings.MONITOR_UPLOAD_PATH_WEBSERVER0

    def updateSpiders(self, siteName, ycount1, tcount1, turl1, diff1, ycount2,
                      tcount2, turl2, diff2):
        # Render one <tr> of the aggregated spider summary table.
        return '<tr>' + \
               '<th align="center" valign="middle">{0}</th>'.format(siteName) + \
               '<td align="center" valign="middle">{0}</td>'.format(ycount1) + \
               '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl1, tcount1) + \
               '<td align="center" valign="middle">{0}</td>'.format(diff1) + \
               '<td align="center" valign="middle">{0}</td>'.format(ycount2) + \
               '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl2, tcount2) + \
               '<td align="center" valign="middle">{0}</td>'.format(diff2) + \
               '</tr>'

    def updateSite(self, number, title, url):
        # Render one <tr> of a per-site url listing.
        return '<tr>' + \
               '<td align="center" valign="middle">{0}</td>'.format(number) + \
               '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(url, title) + \
               '</tr>'

    def uploadFile(self, fromFile, toFile):
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.globalSettings.IP_WEBSERVER0,
                        self.globalSettings.PORT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0,
                        fromFile, toFile):
                    print 'Succeeded to upload monitor file: {0}'.format(
                        fromFile)
                    return True
            except Exception as e:
                print 'Exception {0} while uploading monitor site file: {1}'.format(
                    e.message, fromFile)
                return False

    def updateSingleSite(self, preBackupPath, postBackupPath, siteName):
        singleSiteData = singleSiteDto(self.siteinfo.name, 0, 0, None, 0)
        isPreBackupFileExists = os.path.exists(preBackupPath)
        isPostBackupFileExists = os.path.exists(postBackupPath)
        preCsvContent = None
        if isPreBackupFileExists:
            print "Start to read url backup file: {0}".format(
                self.settings.NAME)
            preCsvContent = self.file.readColsFromCSV(preBackupPath,
                                                      ['title', 'url'])
            singleSiteData.tcount = len(preCsvContent.values)
        else:
            print "Url backup file does not exist: {0}".format(
                self.settings.NAME)
            singleSiteData.tcount = 0
        if isPostBackupFileExists:
            print "Start to read post url backup file: {0}".format(
                self.settings.NAME)
            postCsvContent = self.file.readColsFromCSV(postBackupPath,
                                                       ['title', 'url'])
            singleSiteData.ycount = len(postCsvContent.values)
        else:
            print "Post url backup file does not exist: {0}".format(
                self.settings.NAME)
            singleSiteData.ycount = 0
        singleSiteData.diff = singleSiteData.tcount - singleSiteData.ycount
new back up url: {0}".format(self.settings.NAME) else: template = self.file.readFromTxt( self.monitor_site_template_path) finalContent = '' number = 1 for item in preCsvContent.values: finalContent = "{0}{1}".format( finalContent, self.updateSite(number, item[1], item[0])) number += 1 template = template.replace( 'UpdateTime', self.doraemon.getCurrentLocalTime()) template = template.replace('ServerName', siteName) template = template.replace('SiteName', self.siteinfo.name) template = template.replace('MainContent', finalContent) turl = '{0}{1}_{2}.html'.format(self.monitor_site_url, self.settings.NAME, siteName) singleSiteData.turl = turl uploadLocalHtmlPath = '{0}/{1}_{2}.html'.format( self.monitor_upload_local, self.settings.NAME, siteName) self.file.writeToHtmlCover(uploadLocalHtmlPath, template) return singleSiteData def processAllSites(self, allSitesData=None): template = self.file.readFromTxt(self.monitor_spiders_template_path) mainContent = '' t = totalDto(0, 0, 0, 0, 0, 0) for data in allSitesData: mainContent = '{0}{1}'.format( mainContent, self.updateSpiders(data.prd3.sitename, data.prd3.ycount, data.prd3.tcount, data.prd3.turl, data.prd3.diff, data.prd4.ycount, data.prd4.tcount, data.prd4.turl, data.prd4.diff)) t.prd3ytotal += data.prd3.ycount t.prd3ttotal += data.prd3.tcount t.prd4ytotal += data.prd4.ycount t.prd4ttotal += data.prd4.tcount t.prd3difftotal = t.prd3ttotal - t.prd3ytotal t.prd4difftotal = t.prd4ttotal - t.prd4ytotal mainContent = '{0}{1}'.format( mainContent, self.updateSpiders('Summary', t.prd3ytotal, t.prd3ttotal, '', t.prd3difftotal, t.prd4ytotal, t.prd4ttotal, '', t.prd4difftotal)) template = template.replace('UpdateTime', self.doraemon.getCurrentLocalTime()) template = template.replace('MainContent', mainContent) localHtmlPath = '{0}/index.html'.format(self.monitor_upload_local) self.file.writeToHtmlCover(localHtmlPath, template) self.doraemon.tar(self.monitor_upload_local) fromFile = '{0}.tar.gz'.format(self.monitor_upload_local) self.uploadFile( fromFile, '{0}/monitor.tar.gz'.format(self.monitor_upload_webserver0)) os.remove(fromFile) def processSingleSite(self): spidersContent = allSitesDto(None, None) spidersContent.prd3 = self.updateSingleSite(self.url_backup_path, self.url_backup_post_path, 'prd3') spidersContent.prd4 = self.updateSingleSite( self.content_backup_path, self.content_backup_post_path, 'prd4') return spidersContent