# NOTE(review): downloadRequestsImage3 is defined more than once in this file;
# Python keeps only the last definition, so the copies should be consolidated.
def downloadRequestsImage3(imgUrl, refererUrl, outputConfig):
    """
    Downloads a single image using the Requests package.

    Every non-None URL is recorded in the shared ``urlsCache`` (defined
    outside this function) so that a URL is attempted at most once. The image
    is written to the temporary output location only when the response is
    ``200 OK`` and its ``Content-Length`` header is present and not smaller
    than ``outputConfig['minimumContentLength']``.

    :param imgUrl: URL of the image; ``None`` is ignored.
    :param refererUrl: value sent as the ``referer`` request header.
    :param outputConfig: mapping read for ``minimumContentLength`` and
        ``outputTmpFolderPrefix``; it is also handed to ``getOutputFolder``.
    :return: always ``None``. Connection and redirect errors are logged and
        swallowed; other exceptions propagate.
    """
    # Reject None before touching the cache: the log messages below build
    # strings from imgUrl, so a repeated None used to raise TypeError.
    if imgUrl is None:
        return None

    if imgUrl in urlsCache:
        logger.info("Cache Hit for: " + imgUrl)
        return None
    urlsCache.add(imgUrl)
    logger.info("urlsCache updated, current size: " + str(len(urlsCache)))

    file_name = buildLocalFilename(imgUrl)
    if file_name == "":
        return None

    # TODO: make retries configurable (an earlier urllib3 Retry attempt was
    # hard-coded and has been removed).
    downloads()  # log a counter of a download being made
    headers = {
        'user-agent': downloaderConfig.getUserAgent(),
        'referer': refererUrl,
    }
    try:
        resp = requests.get(imgUrl.strip(), headers=headers)
        printAccessLog(str(resp.status_code), resp.headers, imgUrl)
        if resp.status_code != requests.codes.ok:
            logger.error("error response code: " + str(resp.status_code) +
                         " for image URL: " + imgUrl)
            return None

        contentLength = resp.headers.get('Content-Length')
        if contentLength is None:
            logger.error("Content-Length == None for URL: " + imgUrl)
            return None

        if int(contentLength) < int(outputConfig['minimumContentLength']):
            # str() keeps the message working when the config value is an int.
            logger.info("Image Content-Length is less than the minimum (" +
                        str(outputConfig['minimumContentLength']) + ")")
            return None

        # The body was already fetched by requests.get above, so persist it
        # directly instead of downloading the image a second time (the second
        # request also lacked the user-agent/referer headers).
        targetPath = buildTmpFileOutputLocation(
            outputConfig['outputTmpFolderPrefix'],
            getOutputFolder(outputConfig), file_name)
        with open(targetPath, 'wb') as imageFile:
            imageFile.write(resp.content)
        # .get(): a missing Content-Type header must not abort a saved image.
        logger.debug("image saved(" + str(resp.headers.get('Content-Type')) +
                     "), filename: " + file_name)
    except requests.exceptions.ConnectionError as e1:
        logger.error("A ConnectionError occurred while requesting " + imgUrl)
        logError(e1)
    except requests.exceptions.TooManyRedirects as e2:
        logger.error("A TooManyRedirects occurred while requesting " + imgUrl)
        logError(e2)
    return None
# NOTE(review): get_videos is defined more than once in this file; Python
# keeps only the last definition, so the copies should be consolidated.
def get_videos(soup, baseUrl, outputConfig, exceptionsConfig):
    """
    Downloads videos embedded through iframes and keeps the large enough ones.

    The page at ``baseUrl`` is fetched, every ``<iframe>`` with a ``src`` is
    followed, and each ``<source>`` whose ``type`` is ``video/<extension>`` is
    passed to ``downloadRequestsVideo3``. Afterwards every file in the
    temporary folder that is larger than ``minimumFileSize * 1000`` bytes is
    moved to the output folder and the temporary folder is removed.

    :param soup: not used by this function; the page is fetched again.
    :param baseUrl: page URL; also sent as ``Referer`` for iframe requests.
    :param outputConfig: mapping read for ``outputTmpFolderPrefix`` and
        ``minimumFileSize``; it is also handed to ``getOutputFolder``.
    :param exceptionsConfig: not used by this function.
    :return: ``None``. The listed requests/urllib errors are logged and
        swallowed; other exceptions propagate.
    """
    localheaders = {'User-Agent': downloaderConfig.getUserAgent()}
    with requests.Session() as session:
        session.headers = localheaders
        try:
            localresponse = session.get(baseUrl)
            localsoup = BeautifulSoup(localresponse.content, "html.parser")
            # follow the iframe url
            for soup_iframe in localsoup.findAll('iframe', src=True):
                iframeUrl = soup_iframe['src']
                if not iframeUrl:
                    continue
                iframeResponse = session.get(iframeUrl,
                                             headers={'Referer': baseUrl})
                iframeSoup = BeautifulSoup(iframeResponse.content,
                                           "html.parser")
                for iframeSourceLink in iframeSoup.findAll('source'):
                    # <source> tags are not guaranteed to carry both
                    # attributes; indexing them raised an uncaught KeyError
                    # that also skipped the cleanup below.
                    mimeType = iframeSourceLink.get('type')
                    videoUrl = iframeSourceLink.get('src')
                    if not mimeType or not videoUrl:
                        continue
                    # "video/mp4" -> ("video", "mp4"); a value without "/"
                    # yields an empty media type and is skipped.
                    mediaType, _, extension = mimeType.rpartition('/')
                    if mediaType == 'video':
                        downloadRequestsVideo3(videoUrl, extension, baseUrl,
                                               outputConfig)
        except requests.exceptions.ConnectionError as e1:
            logger.error("A ConnectionError occurred.")
            logError(e1)
        except requests.exceptions.TooManyRedirects as e2:
            logger.error("A TooManyRedirects occurred.")
            logError(e2)
        except urllib.error.HTTPError as e3:
            logger.error("A urllib.error.HTTPError occurred.")
            logError(e3)
        except requests.exceptions.MissingSchema as e4:
            logger.error("A requests.exceptions.MissingSchema occurred.")
            logError(e4)

    # Check whether the file is the minimum size and move it if it is
    outputFolder = getOutputFolder(outputConfig)
    tmpFileLocation = buildTmpFileOutputLocation(
        outputConfig['outputTmpFolderPrefix'], outputFolder, "")
    copyCounter = 0
    # The temporary folder may be missing when nothing was downloaded.
    if os.path.isdir(tmpFileLocation):
        for eachVideo in os.listdir(tmpFileLocation):
            videoFile = os.path.join(tmpFileLocation, eachVideo)
            if os.path.getsize(videoFile) > int(outputConfig['minimumFileSize']) * 1000:
                logger.info("File " + videoFile + " is greater than " +
                            str(outputConfig['minimumFileSize']) + "k")
                # Move the file to the actual location
                saved()  # log a counter of a object being moved
                os.makedirs(outputFolder, exist_ok=True)
                shutil.move(videoFile, os.path.join(outputFolder, eachVideo))
                copyCounter += 1
        # Delete the temporary folder and whatever was too small to keep
        shutil.rmtree(tmpFileLocation)
        logger.info("Temporary files and folder deleted.")
    logger.info(str(copyCounter) + " videos saved.")
# NOTE(review): downloadRequestsImage3 is defined more than once in this file;
# Python keeps only the last definition, so the copies should be consolidated.
def downloadRequestsImage3(imgUrl, refererUrl, outputConfig):
    """
    Downloads a single image using the Requests package.

    Every non-None URL is recorded in the shared ``urlsCache`` (defined
    outside this function) so that a URL is attempted at most once. The image
    is written to the temporary output location only when the response is
    ``200 OK`` and its ``Content-Length`` header is present and not smaller
    than ``outputConfig['minimumContentLength']``.

    :param imgUrl: URL of the image; ``None`` is ignored.
    :param refererUrl: value sent as the ``referer`` request header.
    :param outputConfig: mapping read for ``minimumContentLength`` and
        ``outputTmpFolderPrefix``; it is also handed to ``getOutputFolder``.
    :return: always ``None``. Connection and redirect errors are logged and
        swallowed; other exceptions propagate.
    """
    # Reject None before touching the cache: the log messages below build
    # strings from imgUrl, so a repeated None used to raise TypeError.
    if imgUrl is None:
        return None

    if imgUrl in urlsCache:
        logger.info("Cache Hit for: " + imgUrl)
        return None
    urlsCache.add(imgUrl)
    logger.info("urlsCache updated, current size: " + str(len(urlsCache)))

    file_name = buildLocalFilename(imgUrl)
    if file_name == "":
        return None

    # TODO: make retries configurable (an earlier urllib3 Retry attempt was
    # hard-coded and has been removed).
    downloads()  # log a counter of a download being made
    headers = {
        'user-agent': downloaderConfig.getUserAgent(),
        'referer': refererUrl,
    }
    try:
        resp = requests.get(imgUrl.strip(), headers=headers)
        printAccessLog(str(resp.status_code), resp.headers, imgUrl)
        if resp.status_code != requests.codes.ok:
            logger.error("error response code: " + str(resp.status_code) +
                         " for image URL: " + imgUrl)
            return None

        contentLength = resp.headers.get('Content-Length')
        if contentLength is None:
            logger.error("Content-Length == None for URL: " + imgUrl)
            return None

        if int(contentLength) < int(outputConfig['minimumContentLength']):
            # str() keeps the message working when the config value is an int.
            logger.info("Image Content-Length is less than the minimum (" +
                        str(outputConfig['minimumContentLength']) + ")")
            return None

        # The body was already fetched by requests.get above, so persist it
        # directly instead of downloading the image a second time (the second
        # request also lacked the user-agent/referer headers).
        targetPath = buildTmpFileOutputLocation(
            outputConfig['outputTmpFolderPrefix'],
            getOutputFolder(outputConfig), file_name)
        with open(targetPath, 'wb') as imageFile:
            imageFile.write(resp.content)
        # .get(): a missing Content-Type header must not abort a saved image.
        logger.debug("image saved(" + str(resp.headers.get('Content-Type')) +
                     "), filename: " + file_name)
    except requests.exceptions.ConnectionError as e1:
        logger.error("A ConnectionError occurred while requesting " + imgUrl)
        logError(e1)
    except requests.exceptions.TooManyRedirects as e2:
        logger.error("A TooManyRedirects occurred while requesting " + imgUrl)
        logError(e2)
    return None
# NOTE(review): get_videos is defined more than once in this file; Python
# keeps only the last definition, so the copies should be consolidated.
def get_videos(soup, baseUrl, outputConfig, exceptionsConfig):
    """
    Downloads videos embedded through iframes and keeps the large enough ones.

    The page at ``baseUrl`` is fetched, every ``<iframe>`` with a ``src`` is
    followed, and each ``<source>`` whose ``type`` is ``video/<extension>`` is
    passed to ``downloadRequestsVideo3``. Afterwards every file in the
    temporary folder that is larger than ``minimumFileSize * 1000`` bytes is
    moved to the output folder and the temporary folder is removed.

    :param soup: not used by this function; the page is fetched again.
    :param baseUrl: page URL; also sent as ``Referer`` for iframe requests.
    :param outputConfig: mapping read for ``outputTmpFolderPrefix`` and
        ``minimumFileSize``; it is also handed to ``getOutputFolder``.
    :param exceptionsConfig: not used by this function.
    :return: ``None``. The listed requests/urllib errors are logged and
        swallowed; other exceptions propagate.
    """
    localheaders = {'User-Agent': downloaderConfig.getUserAgent()}
    with requests.Session() as session:
        session.headers = localheaders
        try:
            localresponse = session.get(baseUrl)
            localsoup = BeautifulSoup(localresponse.content, "html.parser")
            # follow the iframe url
            for soup_iframe in localsoup.findAll('iframe', src=True):
                iframeUrl = soup_iframe['src']
                if not iframeUrl:
                    continue
                iframeResponse = session.get(iframeUrl,
                                             headers={'Referer': baseUrl})
                iframeSoup = BeautifulSoup(iframeResponse.content,
                                           "html.parser")
                for iframeSourceLink in iframeSoup.findAll('source'):
                    # <source> tags are not guaranteed to carry both
                    # attributes; indexing them raised an uncaught KeyError
                    # that also skipped the cleanup below.
                    mimeType = iframeSourceLink.get('type')
                    videoUrl = iframeSourceLink.get('src')
                    if not mimeType or not videoUrl:
                        continue
                    # "video/mp4" -> ("video", "mp4"); a value without "/"
                    # yields an empty media type and is skipped.
                    mediaType, _, extension = mimeType.rpartition('/')
                    if mediaType == 'video':
                        downloadRequestsVideo3(videoUrl, extension, baseUrl,
                                               outputConfig)
        except requests.exceptions.ConnectionError as e1:
            logger.error("A ConnectionError occurred.")
            logError(e1)
        except requests.exceptions.TooManyRedirects as e2:
            logger.error("A TooManyRedirects occurred.")
            logError(e2)
        except urllib.error.HTTPError as e3:
            logger.error("A urllib.error.HTTPError occurred.")
            logError(e3)
        except requests.exceptions.MissingSchema as e4:
            logger.error("A requests.exceptions.MissingSchema occurred.")
            logError(e4)

    # Check whether the file is the minimum size and move it if it is
    outputFolder = getOutputFolder(outputConfig)
    tmpFileLocation = buildTmpFileOutputLocation(
        outputConfig['outputTmpFolderPrefix'], outputFolder, "")
    copyCounter = 0
    # The temporary folder may be missing when nothing was downloaded.
    if os.path.isdir(tmpFileLocation):
        for eachVideo in os.listdir(tmpFileLocation):
            videoFile = os.path.join(tmpFileLocation, eachVideo)
            if os.path.getsize(videoFile) > int(outputConfig['minimumFileSize']) * 1000:
                logger.info("File " + videoFile + " is greater than " +
                            str(outputConfig['minimumFileSize']) + "k")
                # Move the file to the actual location
                saved()  # log a counter of a object being moved
                os.makedirs(outputFolder, exist_ok=True)
                shutil.move(videoFile, os.path.join(outputFolder, eachVideo))
                copyCounter += 1
        # Delete the temporary folder and whatever was too small to keep
        shutil.rmtree(tmpFileLocation)
        logger.info("Temporary files and folder deleted.")
    logger.info(str(copyCounter) + " videos saved.")