Пример #1
0
def downloadRequestsImage3(imgUrl, refererUrl, outputConfig):
    """Download a single image with Requests and store it in the tmp location.

    The URL is skipped when it is ``None``, when it is already present in
    ``urlsCache``, when ``buildLocalFilename`` returns an empty name, when the
    server does not answer with ``200 OK``, or when the ``Content-Length``
    header is missing or smaller than
    ``outputConfig['minimumContentLength']``.

    Args:
        imgUrl: URL of the image; may be ``None``.
        refererUrl: Value sent as the ``referer`` request header.
        outputConfig: Mapping read for ``minimumContentLength`` and
            ``outputTmpFolderPrefix``; it is also passed to
            ``getOutputFolder``.

    Returns:
        Always ``None``; the outcome is only reported through the logger.
    """
    # Reject None before touching the cache. Previously None was stored in
    # urlsCache on the first call and the next call failed with a TypeError
    # while building the "Cache Hit" message.
    if imgUrl is None:
        return None

    if imgUrl in urlsCache:
        logger.info("Cache Hit for: " + imgUrl)
        return None
    urlsCache.add(imgUrl)
    logger.info("urlsCache updated, current size: " + str(len(urlsCache)))

    file_name = buildLocalFilename(imgUrl)
    if file_name == "":
        return None

    # TODO: configurable retries (connect/read/redirect) are not implemented.
    downloads()  # log a counter of a download being made
    headers = {
        'user-agent': downloaderConfig.getUserAgent(),
        'referer': refererUrl,
    }
    try:
        # NOTE(review): no timeout is passed, so a stalled server blocks this
        # call indefinitely; consider adding one together with a handler for
        # requests.exceptions.Timeout.
        resp = requests.get(imgUrl.strip(), headers=headers)
        printAccessLog(str(resp.status_code), resp.headers, imgUrl)
        if resp.status_code != requests.codes.ok:
            logger.error("error response code: " + str(resp.status_code) +
                         " for image URL: " + imgUrl)
            return None

        contentLength = resp.headers.get('Content-Length')
        if contentLength is None:
            logger.error("Content-Length == None for URL: " + imgUrl)
            return None

        if int(contentLength) < int(outputConfig['minimumContentLength']):
            # str() keeps the message working when the setting is an int.
            logger.info("Image Content-Length is less than the minimum (" +
                        str(outputConfig['minimumContentLength']) + ")")
            return None

        # Persist the body that was already fetched instead of downloading
        # the image a second time without the user-agent/referer headers.
        targetPath = buildTmpFileOutputLocation(
            outputConfig['outputTmpFolderPrefix'],
            getOutputFolder(outputConfig), file_name)
        with open(targetPath, "wb") as imageFile:
            imageFile.write(resp.content)
        # .get() avoids a KeyError when the server omits Content-Type.
        logger.debug("image saved(" + str(resp.headers.get('Content-Type')) +
                     "), filename: " + file_name)
    except requests.exceptions.ConnectionError as e1:
        logger.error("A ConnectionError occurred while requesting " + imgUrl)
        logError(e1)
    except requests.exceptions.TooManyRedirects as e2:
        logger.error("A TooManyRedirects occurred while requesting " + imgUrl)
        logError(e2)
    return None
Пример #2
0
def get_videos(soup, baseUrl, outputConfig, exceptionsConfig):
    """Download the videos referenced by the iframes found on ``baseUrl``.

    The page at ``baseUrl`` is fetched, every ``<iframe>`` with a ``src`` is
    followed, and each ``<source>`` inside it whose MIME type is ``video/*``
    is handed to ``downloadRequestsVideo3``.  Afterwards every file in the
    temporary folder that is larger than ``outputConfig['minimumFileSize']``
    times 1000 bytes is moved to the output folder, and the temporary folder
    is removed.

    Args:
        soup: Not used by this function.
        baseUrl: Page to scan; also sent as ``Referer`` for iframe requests.
        outputConfig: Mapping read for ``outputTmpFolderPrefix`` and
            ``minimumFileSize``; it is also passed to ``getOutputFolder`` and
            ``downloadRequestsVideo3``.
        exceptionsConfig: Not used by this function.

    Returns:
        None.
    """
    localheaders = {'User-Agent': downloaderConfig.getUserAgent()}

    with requests.Session() as session:
        session.headers = localheaders

        try:
            localresponse = session.get(baseUrl)
            localsoup = BeautifulSoup(localresponse.content, "html.parser")

            # follow the iframe url
            for soup_iframe in localsoup.findAll('iframe', src=True):
                src = soup_iframe['src']
                if not src:
                    continue
                iframeResponse = session.get(src, headers={'Referer': baseUrl})
                iframeSoup = BeautifulSoup(iframeResponse.content, "html.parser")

                for iframeSourceLink in iframeSoup.findAll('source'):
                    # .get() instead of [] so that a <source> lacking "src"
                    # or "type" is skipped rather than raising KeyError.
                    sourceUrl = iframeSourceLink.get('src')
                    mimeType = iframeSourceLink.get('type')
                    if not sourceUrl or not mimeType:
                        continue
                    # Drop MIME parameters such as '; codecs="..."' so they
                    # do not leak into the extension.
                    mimeType = mimeType.split(';', 1)[0].strip()
                    mediaType, _, extension = mimeType.rpartition('/')
                    if mediaType == 'video':
                        downloadRequestsVideo3(sourceUrl, extension, baseUrl, outputConfig)
        except requests.exceptions.ConnectionError as e1:
            logger.error("A ConnectionError occurred.")
            logError(e1)
        except requests.exceptions.TooManyRedirects as e2:
            logger.error("A TooManyRedirects occurred.")
            logError(e2)
        except urllib.error.HTTPError as e3:
            logger.error("A urllib.error.HTTPError occurred.")
            logError(e3)
        except requests.exceptions.MissingSchema as e4:
            logger.error("A requests.exceptions.MissingSchema occurred.")
            logError(e4)

        # Move every downloaded file that exceeds the minimum size to the
        # output folder, then discard the temporary folder.
        outputFolder = getOutputFolder(outputConfig)
        tmpFileLocation = buildTmpFileOutputLocation(outputConfig['outputTmpFolderPrefix'], outputFolder, "")
        minimumBytes = int(outputConfig['minimumFileSize']) * 1000
        copyCounter = 0
        # The temporary folder may not exist when nothing was downloaded;
        # listing or removing it would then raise FileNotFoundError.
        if os.path.isdir(tmpFileLocation):
            for eachVideo in os.listdir(tmpFileLocation):
                videoFile = os.path.join(tmpFileLocation, eachVideo)
                if os.path.getsize(videoFile) > minimumBytes:
                    logger.info("File " + videoFile + " is greater than " + str(outputConfig['minimumFileSize']) + "k")
                    saved()  # log a counter of a object being moved
                    os.makedirs(outputFolder, exist_ok=True)
                    shutil.move(videoFile, os.path.join(outputFolder, eachVideo))
                    copyCounter += 1
            shutil.rmtree(tmpFileLocation)
            logger.info("Temporary files and folder deleted.")
        logger.info(str(copyCounter) + " videos saved.")
Пример #3
0
def downloadRequestsImage3(imgUrl, refererUrl, outputConfig):
    """Fetch one image via Requests and write it to the temporary location.

    Nothing is saved when ``imgUrl`` is ``None``, when it is already in
    ``urlsCache``, when ``buildLocalFilename`` yields ``""``, when the
    response status is not ``200 OK``, or when ``Content-Length`` is absent
    or below ``outputConfig['minimumContentLength']``.

    Args:
        imgUrl: Image URL, possibly ``None``.
        refererUrl: Sent as the ``referer`` header of the request.
        outputConfig: Mapping providing ``minimumContentLength`` and
            ``outputTmpFolderPrefix``; also given to ``getOutputFolder``.

    Returns:
        ``None`` in every case; results are reported via the logger only.
    """
    # Guard first: with the None check after the cache lookup, a second call
    # with None hit the cache and crashed concatenating None into the message.
    if imgUrl is None:
        return None

    if imgUrl in urlsCache:
        logger.info("Cache Hit for: "+imgUrl)
        return None
    urlsCache.add(imgUrl)
    logger.info("urlsCache updated, current size: "+str(len(urlsCache)))

    file_name = buildLocalFilename(imgUrl)
    if file_name == "":
        return None

    # TODO: retry policy (connect/read/redirect counts) is still missing.
    downloads() # log a counter of a download being made
    headers = {'user-agent': downloaderConfig.getUserAgent(), 'referer': refererUrl}
    try:
        # NOTE(review): requests.get has no timeout here and can block
        # forever on an unresponsive server.
        resp = requests.get(imgUrl.strip(), headers=headers)
        printAccessLog(str(resp.status_code), resp.headers, imgUrl)
        if resp.status_code != requests.codes.ok:
            logger.error("error response code: "+str(resp.status_code)+" for image URL: "+imgUrl)
            return None

        reportedLength = resp.headers.get('Content-Length')
        if reportedLength is None:
            logger.error("Content-Length == None for URL: "+imgUrl)
            return None

        if int(reportedLength) < int(outputConfig['minimumContentLength']):
            # str() so an integer setting does not break the concatenation.
            logger.info("Image Content-Length is less than the minimum ("+str(outputConfig['minimumContentLength'])+")")
            return None

        # Write the already-downloaded body; a second urlretrieve request
        # would repeat the transfer without the custom headers.
        destination = buildTmpFileOutputLocation(outputConfig['outputTmpFolderPrefix'],
                                                 getOutputFolder(outputConfig), file_name)
        with open(destination, "wb") as outFile:
            outFile.write(resp.content)
        # Content-Type may be missing; .get() prevents a KeyError.
        logger.debug("image saved("+str(resp.headers.get('Content-Type'))+"), filename: "+file_name)
    except requests.exceptions.ConnectionError as e1:
        logger.error("A ConnectionError occurred while requesting "+imgUrl)
        logError(e1)
    except requests.exceptions.TooManyRedirects as e2:
        logger.error("A TooManyRedirects occurred while requesting "+imgUrl)
        logError(e2)
    return None
Пример #4
0
def get_videos(soup, baseUrl, outputConfig, exceptionsConfig):
    """Collect videos embedded through iframes on the page at ``baseUrl``.

    Steps:
      1. Request ``baseUrl`` and parse it.
      2. Request every ``<iframe>`` that has a ``src`` (with ``baseUrl`` as
         ``Referer``) and pass each contained ``<source>`` of MIME type
         ``video/*`` to ``downloadRequestsVideo3``.
      3. Move every file in the temporary folder larger than
         ``outputConfig['minimumFileSize']`` * 1000 bytes to the output
         folder and delete the temporary folder.

    Args:
        soup: Not used by this function.
        baseUrl: URL of the page to scan.
        outputConfig: Mapping read for ``outputTmpFolderPrefix`` and
            ``minimumFileSize``; also forwarded to ``getOutputFolder`` and
            ``downloadRequestsVideo3``.
        exceptionsConfig: Not used by this function.

    Returns:
        None.
    """
    localheaders = {'User-Agent': downloaderConfig.getUserAgent()}

    with requests.Session() as session:
        session.headers = localheaders

        try:
            localresponse = session.get(baseUrl)
            localsoup = BeautifulSoup(localresponse.content, "html.parser")

            # follow the iframe url
            for soup_iframe in localsoup.findAll('iframe', src=True):
                iframeUrl = soup_iframe['src']
                if not iframeUrl:
                    continue
                iframeResponse = session.get(iframeUrl,
                                             headers={'Referer': baseUrl})
                iframeSoup = BeautifulSoup(iframeResponse.content,
                                           "html.parser")

                for iframeSourceLink in iframeSoup.findAll('source'):
                    # Attribute access with [] raises KeyError on a <source>
                    # without "src"/"type"; skip such tags instead.
                    videoUrl = iframeSourceLink.get('src')
                    declaredType = iframeSourceLink.get('type')
                    if not videoUrl or not declaredType:
                        continue
                    # Keep only "major/minor"; parameters such as
                    # '; codecs="..."' must not end up in the extension.
                    essence = declaredType.split(';', 1)[0].strip()
                    majorType, _, extension = essence.rpartition('/')
                    if majorType == 'video':
                        downloadRequestsVideo3(videoUrl, extension, baseUrl,
                                               outputConfig)
        except requests.exceptions.ConnectionError as e1:
            logger.error("A ConnectionError occurred.")
            logError(e1)
        except requests.exceptions.TooManyRedirects as e2:
            logger.error("A TooManyRedirects occurred.")
            logError(e2)
        except urllib.error.HTTPError as e3:
            logger.error("A urllib.error.HTTPError occurred.")
            logError(e3)
        except requests.exceptions.MissingSchema as e4:
            logger.error("A requests.exceptions.MissingSchema occurred.")
            logError(e4)

        # Keep only the files that reach the minimum size, then clean up.
        finalFolder = getOutputFolder(outputConfig)
        tmpFileLocation = buildTmpFileOutputLocation(
            outputConfig['outputTmpFolderPrefix'], finalFolder, "")
        sizeThreshold = int(outputConfig['minimumFileSize']) * 1000
        copyCounter = 0
        # Without this check a missing temporary folder (e.g. no download
        # happened) makes os.listdir/shutil.rmtree raise FileNotFoundError.
        if os.path.isdir(tmpFileLocation):
            for eachVideo in os.listdir(tmpFileLocation):
                videoFile = os.path.join(tmpFileLocation, eachVideo)
                if os.path.getsize(videoFile) > sizeThreshold:
                    logger.info("File " + videoFile + " is greater than " +
                                str(outputConfig['minimumFileSize']) + "k")
                    saved()  # log a counter of a object being moved
                    os.makedirs(finalFolder, exist_ok=True)
                    shutil.move(videoFile,
                                os.path.join(finalFolder, eachVideo))
                    copyCounter += 1
            shutil.rmtree(tmpFileLocation)
            logger.info("Temporary files and folder deleted.")
        logger.info(str(copyCounter) + " videos saved.")