def saveFileName(_imgUrl):
    # Derive a local file name from the image URL; default to a .jpg
    # extension when the URL does not end with a known image extension.
    fName = os.path.split(_imgUrl)[1]
    getLogger().debug('file name ::{}'.format(fName))
    ext = '.jpg'
    reg = re.compile(r'[.](jpg|png|bmp|gif)$')
    mo = reg.search(fName)
    if mo is None:
        return fName + ext
    return fName
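# Illustrative examples only (assuming a configured logger):
#   saveFileName('https://example.com/images/cat')      -> 'cat.jpg'
#   saveFileName('https://example.com/images/cat.png')  -> 'cat.png'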
def __nextDepth__(self, _depUrl, _soup, sSaveDir, _maxDepth, _depth):
    print('depth:' + str(_depth) + '#' * 100)
    depthElms = _soup.select('a')
    getLogger().debug('depthElms count:{}'.format(len(depthElms)))
    if len(depthElms) > 0:
        hrefl = []
        for link in depthElms:
            href = link.get('href')
            # Skip empty or non-absolute links, and the page we came from.
            if href is None or len(href) == 0 or not href.startswith('http'):
                continue
            if _depUrl == href:
                continue
            hrefl.append(href)
        linkLen = len(hrefl)
        for index in range(0, linkLen):
            print('-' * 100)
            getLogger().info('Dep {}/{}'.format(_depth, _maxDepth))
            subDir = sSaveDir + '/Depth_' + str(_depth) + os.path.split(hrefl[index])[1]
            if not os.path.isdir(subDir):
                os.mkdir(subDir)
            getLogger().info('{}/{} -- {}'.format(index, linkLen, hrefl[index]))
            getLogger().info('dir -- {}'.format(subDir))
            print('-' * 100)
            # Recurse into the linked page at the current depth.
            self.__crawring(hrefl[index], subDir, _maxDepth, _depth)
def __init__(self, lUrls=None, lSaveDirs=None, bMp=0, bZip=False, maxDepth=0, proxy=False):
    getLogger().info('__init__ called!!')
    self.__lUrls = lUrls
    self.__lSaveDirs = lSaveDirs
    self.__lProcs = []
    self.__bMp = bMp
    self.__bZip = bZip
    self.__maxDepth = maxDepth
    self.__nJobCount = 0
    self.__proxies = self.__select_proxy__(self.__get_proxies__())
    self.__isProxy = proxy
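# Note: __get_proxies__/__select_proxy__ are defined elsewhere and not shown in
# this section. Since self.__proxies is later assigned to session.proxies, it is
# assumed to be a requests-style proxy mapping, e.g.
# {'http': 'http://10.0.0.1:8080', 'https': 'http://10.0.0.1:8080'}.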
def main():
    init()
    getOption()
    loadProperties()
    getLogger().debug(d['dirs'])
    getLogger().debug(d['urls'])
    getLogger().debug('mp : {} , z:{} , d:{}'.format(getMPOption(), getZipOption(), getMaxDepth()))
    getLogger().info('-- Crawling ... start ::{}'.format(getDefCurrentTime()))

    # RUN
    crawler = mcrawler.MCrawler(d['urls'], d['dirs'], getMPOption(),
                                getZipOption(), getMaxDepth(), getProxyOption())
    crawler.run()

    getLogger().info('-- Crawling ... end ::{}'.format(getDefCurrentTime()))
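# Minimal entry-point guard, assuming this module is meant to be run directly;
# the original invocation is not shown in this section.
if __name__ == '__main__':
    main()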
def __download_image(self, _list, sSaveDir):
    getLogger().info('-- pid:{} -- image download start'.format(os.getpid()))
    session = requests.session()
    for downUrl in _list:
        try:
            if self.__isProxy:
                session.proxies = self.__proxies
            downUrlRes = session.get(downUrl)
            downUrlRes.raise_for_status()

            # Build the local file name to save under.
            nFileName = self.__getSaveFileName__(downUrl)
            wFileName = sSaveDir + '/' + nFileName
            getLogger().debug('save file path :' + wFileName)
            print('Downloading...%s' % (nFileName))

            # Stream the response to disk in ~100 KB chunks.
            with open(wFileName, 'wb') as fw:
                for chunk in downUrlRes.iter_content(100000):
                    fw.write(chunk)
        except Exception as ex:
            print('Download Error...%s' % (os.path.split(downUrl)[1]))
            getLogger().debug('download error :{}'.format(ex))
    getLogger().info('-- pid:{} -- image download end'.format(os.getpid()))
def __crawring(self, sUrl, sSaveDir, _maxDepth, _depth):
    # getLogger().info('pid -- {}:{}'.format(os.getpid(), sUrl))
    getLogger().info('thread -- {}'.format(sUrl))
    try:
        session = requests.session()

        # Apply the proxy settings, if configured.
        if self.__isProxy:
            print('#' * 100)
            print('proxy ip :{}'.format(self.__proxies))
            print('#' * 100)
            session.proxies = self.__proxies

        res = session.get(sUrl)
        res.raise_for_status()
        getLogger().info('text :{}'.format(res.text))

        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        imgElms = soup.select('img')
        getLogger().info('len :{}'.format(len(imgElms)))

        validUrl = []
        if len(imgElms) > 0:
            for elm in imgElms:
                downUrl = elm.get('src')
                # -- Validate the src URL.
                if downUrl is None:
                    continue
                rgx = re.compile('^[.]{1,2}')
                mo = rgx.search(downUrl)
                if mo is not None:
                    # Relative path: prepend the host part of the page URL.
                    rgx = re.compile('^https?.+(com|net|edu|org)')
                    mo = rgx.search(sUrl)
                    if mo is not None:
                        downUrl = str(mo.group()) + '/' + downUrl[2:]
                elif not downUrl.startswith('http'):
                    # Protocol-relative URL: assume https.
                    downUrl = 'https:' + downUrl
                # Store the validated URL in the list.
                validUrl.append(downUrl)

        # -- File download and save
        getLogger().info('-url:{} , mp count : {}'.format(len(validUrl), self.__bMp))
        nMPCount = self.__bMp
        if nMPCount > 1 and len(validUrl) > 5:
            # -- If the MP count exceeds the URL count, cap it at the URL count.
            if len(validUrl) < nMPCount:
                nMPCount = len(validUrl) - 1
            threads = []
            bit = int(len(validUrl) / nMPCount)
            # Split validUrl into nMPCount slices; the last worker takes the
            # remainder (e.g. 10 URLs / 3 workers -> [0:3], [3:6], [6:10]).
            with concurrent.futures.ThreadPoolExecutor(max_workers=nMPCount) as te:
                for index in range(0, nMPCount):
                    start = index * bit
                    end = start + bit
                    if index == (nMPCount - 1):
                        end = len(validUrl)
                    th = te.submit(self.__download_image, validUrl[start:end], sSaveDir)
                    threads.append(th)
                for th in concurrent.futures.as_completed(threads):
                    print('{}'.format(th.result()))
        else:
            self.__download_image(validUrl, sSaveDir)

        # Zip-compress the save directory.
        if self.__bZip:
            mzip.MZip.compress(sSaveDir, logging)

        # NEXT DEPTH
        getLogger().info('_maxDepth :{} , _depth:{}'.format(_maxDepth, _depth))
        if _maxDepth != 0 and _depth < _maxDepth:
            _depth += 1
            self.__nextDepth__(sUrl, soup, sSaveDir, _maxDepth, _depth)
    except Exception as ex:
        getLogger().info('{}'.format(ex))
    return '{} ==> complete'.format(sUrl)
def run():
    global d
    getLogger().debug('-- run start')
    url = d['urls'][0]
    getLogger().debug('-- url:{}'.format(url))
    res = requests.get(url, timeout=3)
    getLogger().debug(res.text)
    res.raise_for_status()

    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    elms = soup.select('img')
    getLogger().debug('-- step1 len:{}'.format(len(elms)))

    downUrls = []
    for e in elms:
        imgUrl = e.get('src')
        # getLogger().debug('-- step2 imgUrl:{}'.format(imgUrl))
        if imgUrl is None or not imgUrl.startswith('http'):
            continue
        downUrls.append(imgUrl)
    getLogger().debug('-- downUrls len:{}'.format(len(downUrls)))

    for durl in downUrls:
        fw = None
        # Compute the file name up front so it is available in the except clause.
        fName = saveFileName(durl)
        try:
            getLogger().debug('-- step2 durl:{}'.format(durl))
            downRes = requests.get(durl, timeout=3)
            downRes.raise_for_status()
            getLogger().debug('-- fName :{}'.format(fName))
            fw = open(d['dirs'][0] + '/' + fName, 'wb')
            print('Downloading.. {}'.format(fName))
            for chunk in downRes.iter_content(100000):
                fw.write(chunk)
        except Exception:
            print('Downloading Failed.. {}'.format(fName))
        finally:
            if fw is not None:
                fw.close()