def DownloadSubAsBytesIO(domain, url, referer=None, cookies=None):
    """
    Download a url and return its content as a file-like object (BytesIO).

    domain  -- host name; prepended to `url` when `url` is a relative path,
               and used as the Referer header when `referer` is not given.
    url     -- absolute url ("http://..." / "https://..."), a host-relative
               url starting with "www.", or a path relative to `domain`.
    referer -- value for the Referer header, for sites that require it in
               order to serve the file. Defaults to `domain`.
    cookies -- optional value for the Cookie header.

    Returns a BytesIO holding the downloaded bytes, or None when the
    download fails. The data is written into the buffer, so the stream
    position is left at the end of the data: use getvalue() or seek(0)
    before reading from it.
    """
    # urlopen needs a single, complete url (scheme included), so build one
    # out of the domain and the url. An already absolute url (http or
    # https) is used as is.
    if url.startswith(("http://", "https://")):
        full_url = url
    elif url.startswith("www."):
        full_url = "http://" + url
    else:
        full_url = "http://%s" % domain + url
    WriteDebug("Getting file: %s" % full_url)

    request_headers = {
        "User-Agent": UserAgents.getAgent(),
        # Some sites reject the request when no referer is sent, so fall
        # back to the domain if None was passed.
        "Referer": referer or domain,
    }
    if cookies:
        request_headers["Cookie"] = cookies

    file_request = Request(full_url, headers=request_headers)
    file_content = BytesIO()
    try:
        url_opened = urlopen(file_request)
        try:
            file_content.write(url_opened.read())
        finally:
            # Always release the underlying socket, even if read() fails.
            url_opened.close()
    except Exception as eX:
        # Best effort: any failure is logged and reported as None.
        WriteDebug("Failed getting file: %s->%s" % (full_url, eX))
        return None

    return file_content
def PerformRequest(domain, url, data="", type=HttpRequestTypes.GET,
                   more_headers="", retry=False, is_redirection=False):
    """
    Perform an http request against `domain` and return the response body.

    We are using fake user-agents.

    domain         -- host to connect to (plain http, 10 seconds timeout).
    url            -- path to request. Should start with "/"; if it does
                      not, the function adds it.
    data           -- request body, passed through str(). Use it when
                      sending a "POST" request.
    type           -- http method (one of HttpRequestTypes). The name
                      shadows the builtin, but is kept for the callers.
    more_headers   -- optional dict of extra headers, merged over the
                      default ones.
    retry          -- when true, only the User-Agent header is sent instead
                      of the full default header set.
    is_redirection -- true when this call is already following a
                      redirection, so no further redirection is followed.

    Returns the body decoded as utf-8 (undecodable bytes are replaced)
    with all "\\r" and "\\n" characters removed. Returns an empty string
    when the request fails.
    """
    import UserAgents

    response = ""
    if not url.startswith("/"):
        url = "/" + url

    httpcon = None
    try:
        httpcon = httplib.HTTPConnection(domain, timeout=10)

        if not retry:
            # Each packet we send will have these params (good for hiding).
            headers = {
                "Connection": r"keep-alive",
                "User-Agent": UserAgents.getAgent(),
                "X-Requested-With": r"XMLHttpRequest",
                "Content-Type": r"application/x-www-form-urlencoded",
                "Accept-Charset": r"utf-8;q=0.7,*;q=0.3",
                "Accept-Language": r"en-US,en;q=0.8",
                "Cache-Control": r"max-age=0",
            }
        else:
            # Only the fake User-Agent.
            headers = {"User-Agent": UserAgents.getAgent()}

        # In case of specifying more headers, we add them.
        if more_headers:
            headers.update(more_headers)

        WriteDebug("Sending request for: %s" % (domain + url))

        max_attempts = 3
        got_response = None
        raw_response = None
        for error_count in range(1, max_attempts + 1):
            try:
                # Before each request we (re)connect, because after a
                # failure the connection was closed below.
                httpcon.connect()
                httpcon.request(type, url, str(data), headers)
                got_response = httpcon.getresponse()
                raw_response = got_response.read()
                # If we got the response, stop retrying.
                break
            except Exception as error:
                WriteDebug("Failed sending the request for the %d time: %s"
                           % (error_count, error))
                # Close it (we're calling connect() again inside the try).
                httpcon.close()
                # Wait before the next attempt; pointless after the last.
                if error_count < max_attempts:
                    sleep(2)

        if raw_response is None:
            # Every attempt failed: there is nothing to decode and no
            # response object to inspect for a redirection.
            return response

        # Decode leniently in order to avoid decoding problems. Should the
        # decoding still fail, fall back to the str() form of the bytes and
        # strip the preceding b and the surrounding single quotes.
        try:
            response = raw_response.decode("utf-8", errors="replace")
        except Exception:
            response = str(raw_response)[2:-1]
        response = response.replace("\r", "").replace("\n", "")

        # An empty response might be a sign that we got a redirection, so
        # we check the Location header against the requested url. If
        # is_redirection is true we already followed one redirection, and
        # therefore we stop the procedure.
        if not response and not is_redirection:
            # getheader() returns None when there is no Location header.
            new_url = got_response.getheader("location")
            if new_url and url not in new_url:
                WriteDebug("Got Redirection: %s->%s" % (url, new_url))
                # Because the location gives us the full address including
                # the protocol and the domain, we remove them in order to
                # get the relative url.
                new_url = new_url.replace("http://", "").replace(domain, "")
                return PerformRequest(domain, new_url, is_redirection=True)
    except Exception as eX:
        WriteDebug("Failed Getting: %s->%s [%s]" % (domain, url, eX))
    finally:
        # Release the socket on every path (close() is safe to repeat).
        if httpcon is not None:
            httpcon.close()

    return response