Example #1
    def scrapeURL(self, url):
        returnArr = []

        self.siteUrl = url
        # Ensure the attribute exists even when neither branch below runs
        # (unknown mode, or an online fetch that did not return 200).
        self.scrapedRawData = None

        if config.get('scraper_mode') == 'online':
            # An empty headers dict tells Network.fetch to pick a random
            # user agent for the request.
            headersArr = {}
            scrapedRawData = Network.fetch(url, headersArr)
            if scrapedRawData['code'] == 200:
                # Keep the headers that were actually sent and use the
                # current URL as the referer for follow-up requests.
                self.siteHeaders = scrapedRawData['headers']['requested']
                self.siteHeaders['referer'] = self.siteUrl

                self.scrapedRawData = scrapedRawData['body']
        elif config.get('scraper_mode') == 'offline':
            # Read a locally stored sample file instead of hitting the
            # network; the with-block closes the file handle automatically.
            filePath = os.path.dirname(os.path.realpath(__file__))
            with open(os.path.join(filePath, 'sample_data', url)) as file:
                self.scrapedRawData = file.read()

        if self.scrapedRawData is not None:
            returnArr = self.processRawData()

        return returnArr
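
The offline branch is the part most worth lifting out: it resolves the directory of the current module and reads a sample file from a sample_data subfolder, which makes the scraper testable without network access. A minimal self-contained sketch of that idea; the function name and the sample file name are illustrative, not part of the project above:

import os

def load_sample(name):
    # Resolve the directory containing this module, then read the
    # sample file stored under its sample_data subfolder.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(base_dir, 'sample_data', name)) as fh:
        return fh.read()

# Hypothetical usage: in offline mode, scrapeURL('product_page.html')
# reads the equivalent of load_sample('product_page.html').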
Example #2
    def fetch(url, headersArr):
        returnArr = {"code": 0}
        try:
            # No headers supplied: fall back to a random user agent so
            # successive requests look less uniform.
            if len(headersArr) == 0:
                useragent = UserAgent()
                headersArr.update(useragent.getRandom())

            # Route the request through the configured proxy when enabled.
            if config.get('proxy_enabled') is True:
                proxies = {'https': config.get('proxy_url_ip')}
                response = requests.get(url,
                                        headers=headersArr,
                                        proxies=proxies)
            else:
                response = requests.get(url, headers=headersArr)

            returnArr = {
                "code": response.status_code,
                "headers": {
                    "requested": headersArr,
                    "received": response.headers
                },
                "body": response.text
            }
        except Exception:
            # Print the full traceback and fall through to the default
            # {"code": 0} result so callers can treat this as a failed
            # request. Requires "import traceback" at module level.
            traceback.print_exc()

        return returnArr
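
The return shape is what makes fetch easy to consume: callers branch on the status code and then pick the body or headers out of the dict, exactly as Example #1 does. Below is a hedged, self-contained sketch of the same pattern; the user-agent pool is illustrative, the URL is a placeholder, and the timeout is an addition the original does not set:

import random
import requests

# Illustrative user-agent pool; the project's UserAgent().getRandom()
# presumably returns a header dict like the one built here.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def fetch(url, headers=None):
    # Same return shape as the method above: code 0 means the request
    # itself failed; otherwise the caller gets status, headers and body.
    result = {'code': 0}
    headers = headers or {'User-Agent': random.choice(USER_AGENTS)}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        result = {
            'code': response.status_code,
            'headers': {'requested': headers, 'received': response.headers},
            'body': response.text,
        }
    except requests.RequestException:
        pass
    return result

# Callers branch on the status code, as Example #1 does:
result = fetch('https://example.com/')  # placeholder URL
if result['code'] == 200:
    print(result['body'][:200])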