예제 #1
0
def build_http_cookie_table(database, verbose=False):
    """Extract cookie data from stored HTTP headers into dedicated cookie tables.

    Creates ``http_request_cookies`` and ``http_response_cookies`` if they do
    not exist, then fills them from the ``Cookie`` headers stored in
    ``http_requests_proxy`` and the ``Set-Cookie`` headers stored in
    ``http_responses_proxy``.  Source rows whose id already appears as a
    ``header_id`` in the matching cookie table are not selected again.

    Args:
        database: Value passed to ``sqlite3.connect`` to open the database.
        verbose: Forwarded to ``parse_cookies``; when true, a progress line
            is also printed at each intermediate commit.

    Returns:
        None.  Results are written to the database and two completion
        messages are printed.

    Raises:
        sqlite3.Error: Propagated unchanged from any failing statement (for
            example when a source table is missing).  The connection is
            closed before the exception leaves this function.
    """
    con = sqlite3.connect(database)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS http_request_cookies ( \
                    id INTEGER PRIMARY KEY AUTOINCREMENT, \
                    crawl_id INTEGER NOT NULL, \
                    header_id INTEGER NOT NULL, \
                    name VARCHAR(200) NOT NULL, \
                    value TEXT NOT NULL, \
                    accessed DATETIME);")
        cur.execute("CREATE TABLE IF NOT EXISTS http_response_cookies ( \
                    id INTEGER PRIMARY KEY AUTOINCREMENT, \
                    crawl_id INTEGER NOT NULL, \
                    header_id INTEGER NOT NULL, \
                    name VARCHAR(200) NOT NULL, \
                    value TEXT NOT NULL, \
                    domain VARCHAR(500), \
                    path VARCHAR(500), \
                    expires DATETIME, \
                    max_age REAL, \
                    httponly BOOLEAN, \
                    secure BOOLEAN, \
                    comment VARCHAR(200), \
                    version VARCHAR(100), \
                    accessed DATETIME);")
        con.commit()

        # Parse http request cookies
        request_select = "SELECT id, crawl_id, headers, time_stamp FROM http_requests_proxy \
                    WHERE id NOT IN (SELECT header_id FROM http_request_cookies)"
        request_insert = "INSERT INTO http_request_cookies \
                            (crawl_id, header_id, name, value, accessed) \
                            VALUES (?,?,?,?,?)"
        _store_header_cookies(con, request_select, request_insert,
                              'Cookie', False, verbose)
        print("Processing HTTP Request Cookies Complete")

        # Parse http response cookies
        response_select = "SELECT id, crawl_id, url, headers, time_stamp FROM http_responses_proxy \
                    WHERE id NOT IN (SELECT header_id FROM http_response_cookies)"
        response_insert = "INSERT INTO http_response_cookies \
                            (crawl_id, header_id, name, \
                            value, domain, path, expires, max_age, \
                            httponly, secure, comment, version, accessed) \
                            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"
        _store_header_cookies(con, response_select, response_insert,
                              'Set-Cookie', True, verbose)
        print("Processing HTTP Response Cookies Complete")
    finally:
        # Release the database handle even when a statement above raised.
        con.close()


def _load_stored_headers(header_str):
    """Rebuild an ODictCaseless from the serialized headers of one row.

    The string is decoded as JSON first.  If that path raises ValueError the
    legacy fallback below evaluates the string with ``eval`` instead.
    """
    header = ODictCaseless()
    try:
        header.load_state(json.loads(header_str))
    except ValueError:  # XXX temporary shim -- should be removed
        # SECURITY NOTE(review): eval() runs whatever expression is stored in
        # the database.  It is kept only so rows that are not valid JSON keep
        # loading; do not point this at a database from an untrusted source.
        header.load_state(eval(header_str))
    return header


def _store_header_cookies(con, select_sql, insert_sql, header_name,
                          is_response, verbose, commit_interval=10000):
    """Parse one header field of every selected row and insert its cookies.

    Args:
        con: Open sqlite3 connection; used for reading, writing and commits.
        select_sql: Query yielding ``(id, crawl_id, headers, time_stamp)``
            rows, or ``(id, crawl_id, url, headers, time_stamp)`` rows when
            ``is_response`` is true.
        insert_sql: Parameterized INSERT taking ``crawl_id``, the row id, the
            values of one parsed cookie tuple, and ``time_stamp``, in that
            order.
        header_name: Header field whose values are handed to
            ``parse_cookies``.
        is_response: When true, ``parse_cookies`` is additionally given the
            row's url and ``response_cookie=True``.
        verbose: Forwarded to ``parse_cookies``; also enables progress output.
        commit_interval: Minimum number of inserts between intermediate
            commits.
    """
    reader = con.cursor()
    writer = con.cursor()  # separate cursor so inserts do not disturb the read
    inserted = 0
    last_commit = 0
    reader.execute(select_sql)
    for row in reader:
        if is_response:
            row_id, crawl_id, req_url, header_str, time_stamp = row
            parse_kwargs = {'url': req_url, 'response_cookie': True}
        else:
            row_id, crawl_id, header_str, time_stamp = row
            parse_kwargs = {}
        header = _load_stored_headers(header_str)
        for cookie_str in header[header_name]:
            for query in parse_cookies(cookie_str, verbose, **parse_kwargs):
                writer.execute(insert_sql,
                               (crawl_id, row_id) + query + (time_stamp, ))
                inserted += 1
        # A single row may add several cookies, so the counter can jump past
        # an exact multiple of the interval; compare against the count at the
        # last commit instead of testing divisibility.
        if inserted - last_commit >= commit_interval:
            last_commit = inserted
            con.commit()
            if verbose:
                print(str(inserted) + " Cookies Processed")
    con.commit()
예제 #2
0
def build_http_cookie_table(database, verbose=False):
    """Extract cookie data from stored HTTP headers into dedicated cookie tables.

    Creates ``http_request_cookies`` and ``http_response_cookies`` if they do
    not exist, then fills them from the ``Cookie`` headers stored in
    ``http_requests`` and the ``Set-Cookie`` headers stored in
    ``http_responses``.  Source rows whose id already appears as a
    ``header_id`` in the matching cookie table are not selected again.

    Args:
        database: Value passed to ``sqlite3.connect`` to open the database.
        verbose: Forwarded to ``parse_cookies``; when true, a progress line
            is also printed at each intermediate commit.

    Returns:
        None.  Results are written to the database and two completion
        messages are printed.

    Raises:
        sqlite3.Error: Propagated unchanged from any failing statement (for
            example when a source table is missing).  The connection is
            closed before the exception leaves this function.
    """
    con = sqlite3.connect(database)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS http_request_cookies ( \
                    id INTEGER PRIMARY KEY AUTOINCREMENT, \
                    crawl_id INTEGER NOT NULL, \
                    header_id INTEGER NOT NULL, \
                    name VARCHAR(200) NOT NULL, \
                    value TEXT NOT NULL, \
                    accessed DATETIME);")
        cur.execute("CREATE TABLE IF NOT EXISTS http_response_cookies ( \
                    id INTEGER PRIMARY KEY AUTOINCREMENT, \
                    crawl_id INTEGER NOT NULL, \
                    header_id INTEGER NOT NULL, \
                    name VARCHAR(200) NOT NULL, \
                    value TEXT NOT NULL, \
                    domain VARCHAR(500), \
                    path VARCHAR(500), \
                    expires DATETIME, \
                    max_age REAL, \
                    httponly BOOLEAN, \
                    secure BOOLEAN, \
                    comment VARCHAR(200), \
                    version VARCHAR(100), \
                    accessed DATETIME);")
        con.commit()

        # Parse http request cookies
        request_select = "SELECT id, crawl_id, headers, time_stamp FROM http_requests \
                    WHERE id NOT IN (SELECT header_id FROM http_request_cookies)"
        request_insert = "INSERT INTO http_request_cookies \
                            (crawl_id, header_id, name, value, accessed) \
                            VALUES (?,?,?,?,?)"
        _store_header_cookies(con, request_select, request_insert,
                              'Cookie', False, verbose)
        print("Processing HTTP Request Cookies Complete")

        # Parse http response cookies
        response_select = "SELECT id, crawl_id, url, headers, time_stamp FROM http_responses \
                    WHERE id NOT IN (SELECT header_id FROM http_response_cookies)"
        response_insert = "INSERT INTO http_response_cookies \
                            (crawl_id, header_id, name, \
                            value, domain, path, expires, max_age, \
                            httponly, secure, comment, version, accessed) \
                            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"
        _store_header_cookies(con, response_select, response_insert,
                              'Set-Cookie', True, verbose)
        print("Processing HTTP Response Cookies Complete")
    finally:
        # Release the database handle even when a statement above raised.
        con.close()


def _load_stored_headers(header_str):
    """Rebuild an ODictCaseless from the serialized headers of one row.

    The string is decoded as JSON first.  If that path raises ValueError the
    legacy fallback below evaluates the string with ``eval`` instead.
    """
    header = ODictCaseless()
    try:
        header.load_state(json.loads(header_str))
    except ValueError:  # XXX temporary shim -- should be removed
        # SECURITY NOTE(review): eval() runs whatever expression is stored in
        # the database.  It is kept only so rows that are not valid JSON keep
        # loading; do not point this at a database from an untrusted source.
        header.load_state(eval(header_str))
    return header


def _store_header_cookies(con, select_sql, insert_sql, header_name,
                          is_response, verbose, commit_interval=10000):
    """Parse one header field of every selected row and insert its cookies.

    Args:
        con: Open sqlite3 connection; used for reading, writing and commits.
        select_sql: Query yielding ``(id, crawl_id, headers, time_stamp)``
            rows, or ``(id, crawl_id, url, headers, time_stamp)`` rows when
            ``is_response`` is true.
        insert_sql: Parameterized INSERT taking ``crawl_id``, the row id, the
            values of one parsed cookie tuple, and ``time_stamp``, in that
            order.
        header_name: Header field whose values are handed to
            ``parse_cookies``.
        is_response: When true, ``parse_cookies`` is additionally given the
            row's url and ``response_cookie=True``.
        verbose: Forwarded to ``parse_cookies``; also enables progress output.
        commit_interval: Minimum number of inserts between intermediate
            commits.
    """
    reader = con.cursor()
    writer = con.cursor()  # separate cursor so inserts do not disturb the read
    inserted = 0
    last_commit = 0
    reader.execute(select_sql)
    for row in reader:
        if is_response:
            row_id, crawl_id, req_url, header_str, time_stamp = row
            parse_kwargs = {'url': req_url, 'response_cookie': True}
        else:
            row_id, crawl_id, header_str, time_stamp = row
            parse_kwargs = {}
        header = _load_stored_headers(header_str)
        for cookie_str in header[header_name]:
            for query in parse_cookies(cookie_str, verbose, **parse_kwargs):
                writer.execute(insert_sql,
                               (crawl_id, row_id) + query + (time_stamp,))
                inserted += 1
        # A single row may add several cookies, so the counter can jump past
        # an exact multiple of the interval; compare against the count at the
        # last commit instead of testing divisibility.
        if inserted - last_commit >= commit_interval:
            last_commit = inserted
            con.commit()
            if verbose:
                print(str(inserted) + " Cookies Processed")
    con.commit()