Example #1
# BytesIO lives in io, and Request/urlopen in urllib.request (assuming
# Python 3); WriteDebug and UserAgents are project-level helpers defined
# elsewhere in the project
from io import BytesIO
from urllib.request import Request, urlopen

def DownloadSubAsBytesIO(domain, url, referer=None, cookies=None):
    """ Download a url, and return as file-like object (Bytes). Use the referer
        parameter if the site require such parameter in the header in order to
        download the file
    """

    # urlopen accepts only a full url, so we need to join the domain and url
    # into one string. The scheme is also required, hence the "http://"
    full_url = ""
    if url.startswith("http://"):
        full_url = url
    elif url.startswith("www."):
        full_url = "http://" + url
    else:
        full_url = "http://%s" % domain + url
    WriteDebug("Getting file: %s" % full_url)

    file_content = BytesIO()
    # Set the referer to be the domain because some sites will reject our
    # request if we don't
    request_headers = {
        "User-Agent": UserAgents.getAgent(),
        # Set the referer to be the domain if None passed
        "Referer": referer or domain,
    }
    if cookies:
        request_headers.update({"Cookie": cookies})

    file_request = Request(full_url, headers=request_headers)
    try:
        url_opened = urlopen(file_request)
        file_content.write(url_opened.read())
    except Exception as eX:
        WriteDebug("Failed getting file: %s->%s" % (full_url, eX))
        return None
    # Rewind the buffer so callers can read the content from the beginning
    file_content.seek(0)
    return file_content
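
Below is a minimal usage sketch for the example above. WriteDebug and UserAgents are project-level helpers that the snippet does not define, so the stand-ins here are assumptions added only to make the sketch self-contained; the domain and path are placeholders.

# Hypothetical stand-ins for the project helpers (the real project ships
# its own WriteDebug and UserAgents)
def WriteDebug(message):
    print("[debug] %s" % message)

class UserAgents:
    @staticmethod
    def getAgent():
        # A fixed agent string; the real helper presumably rotates fakes
        return "Mozilla/5.0 (X11; Linux x86_64)"

content = DownloadSubAsBytesIO("example.com", "/subs/12345.srt")
if content is not None:
    # The buffer is rewound before being returned, so read() yields the
    # whole payload
    print("Downloaded %d bytes" % len(content.read()))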
Example #2
# httplib became http.client in Python 3; the alias keeps the name used
# below (assuming Python 3, given the bytes handling in this function).
# HttpRequestTypes, WriteDebug and UserAgents are project-level helpers
import http.client as httplib
from time import sleep

def PerformRequest(domain, url, data="", type=HttpRequestTypes.GET, more_headers=None, retry=False, is_redirection=False):
    """
        Performs HTTP requests using fake user agents. Use the data arg
        when sending a "POST" request. You can also specify extra headers
        by supplying a dict in the more_headers arg.

        The url should start with "/". If it doesn't, the function adds it.
    """
    import UserAgents

    response = ""
    if not url.startswith("/"):
        url = "/" + url
    try:
        httpcon = httplib.HTTPConnection(domain, timeout=10)

        headers = {}
        # Every request we send will carry these params (good for blending in)
        if not retry:
            headers = {
                "Connection": r"keep-alive",
                "User-Agent": UserAgents.getAgent(),
                "X-Requested-With": r"XMLHttpRequest",
                "Content-Type": r"application/x-www-form-urlencoded",
                "Accept-Charset": r"utf-8;q=0.7,*;q=0.3",
                "Accept-Language": r"en-US,en;q=0.8",
                "Cache-Control": r"max-age=0",
            }
        else:
            # The Fake User Agent
            headers = {"User-Agent": UserAgents.getAgent()}

        # If extra headers were specified, add them
        if more_headers:
            headers.update(more_headers)

        WriteDebug("Sending request for: %s" % (domain + url))
        got_response = None
        response = None
        # Try 3 times.
        for error_count in range(1, 4):
            try:
                # Before each request, we need to try and connect, because
                # we're probably not connected (that's why the exception that
                # we're catching was raised).
                httpcon.connect()
                httpcon.request(type, url, str(data), headers)
                got_response = httpcon.getresponse()
                response = got_response.read()
                # If we got the response, break the loop.
                break
            except Exception as error:
                WriteDebug("Failed sending the request for the %d time: %s" % (error_count, error))
                # Sleep some time before we request it again.
                sleep(2)
                # Close it (we're calling connect() again inside the try).
                httpcon.close()

        # In order to avoid decoding problems, we just convert the bytes to
        # str. The problem is that when we do that, the str preserves the
        # leading 'b' of the bytes type, so we remove it, along with the
        # single quotes at the start and the end of the string
        if response is not None:
            try:
                response = response.decode("utf-8", errors="replace")
            except (UnicodeDecodeError, AttributeError):
                response = str(response)[2:-1]
            response = response.replace("\r", "").replace("\n", "")
        # An empty response might be a sign that we got a redirection, and
        # therefore we check the Location header against the requested url.
        # If is_redirection is true, we already followed one redirection,
        # and therefore we stop the procedure
        if not response and not is_redirection and got_response is not None:
            new_url = got_response.getheader("location", "")
            if new_url and url not in new_url:
                WriteDebug("Got Redirection: %s->%s" % (url, new_url))
                # Because the location gives us the full address including the
                # protocol and the domain, we remove them in order to get the
                # relative url
                new_url = new_url.replace("http://", "").replace(domain, "")
                return PerformRequest(domain, new_url, is_redirection=True)
    except Exception as eX:
        WriteDebug("Failed Getting: %s->%s [%s]" % (domain, url, eX))

    return response
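
A similar usage sketch for PerformRequest, reusing the WriteDebug stub from the previous sketch. HttpRequestTypes, WriteDebug and UserAgents are project modules that are not shown here, so everything below is an assumption made for illustration; note that a stand-in for HttpRequestTypes would have to exist before the function definition runs, because HttpRequestTypes.GET is evaluated as a default argument.

import sys
import types

# Hypothetical stand-in for the project's UserAgents module, registered so
# that the "import UserAgents" inside PerformRequest resolves
fake_agents = types.ModuleType("UserAgents")
fake_agents.getAgent = lambda: "Mozilla/5.0 (X11; Linux x86_64)"
sys.modules["UserAgents"] = fake_agents

# A plain GET request
page = PerformRequest("example.com", "/index.html")

# A POST with a urlencoded body and one extra header
result = PerformRequest(
    "example.com",
    "/login",
    data="user=foo&password=bar",
    type=HttpRequestTypes.POST,
    more_headers={"X-Custom-Header": "1"},
)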