def build_http_cookie_table(database, verbose=False):
    """Extract cookies from stored HTTP headers into dedicated cookie tables.

    Creates the ``http_request_cookies`` and ``http_response_cookies`` tables
    if they do not exist yet. It then reads every row of
    ``http_requests_proxy`` / ``http_responses_proxy`` whose id is not already
    referenced as ``header_id`` in the matching cookie table, parses its
    ``Cookie`` / ``Set-Cookie`` header values with ``parse_cookies`` and
    inserts one row per tuple that ``parse_cookies`` returns.

    Args:
        database: Database path handed to ``sqlite3.connect``.
        verbose: Forwarded to ``parse_cookies``; when true a progress line is
            also printed at every intermediate commit.

    Raises:
        sqlite3.Error: Propagated from any failing statement (for example
            when a source table is missing). The connection is closed before
            the exception leaves this function.
    """
    # NOTE(review): another definition with this same name, reading from
    # http_requests / http_responses, appears later in this file and would
    # shadow this one at import time -- confirm which one is intended.

    # Number of inserted cookies after which pending work is committed.
    batch_size = 10000

    con = sqlite3.connect(database)
    try:
        reader = con.cursor()  # walks the source header rows
        writer = con.cursor()  # performs the inserts while `reader` is open

        def load_headers(header_str):
            """Deserialize one stored header blob into an ODictCaseless."""
            header = ODictCaseless()
            try:
                header.load_state(json.loads(header_str))
            except ValueError:
                # XXX temporary shim -- should be removed.
                # SECURITY: eval() runs arbitrary expressions; header_str
                # comes from crawled data stored in the database. Kept only
                # for rows that were not serialized as JSON.
                header.load_state(eval(header_str))
            return header

        def copy_cookies(select_sql, insert_sql, header_name, is_response):
            """Parse `header_name` of each selected row and insert the cookies."""
            processed = 0
            last_commit = 0
            reader.execute(select_sql)
            for row in reader:
                if is_response:
                    row_id, crawl_id, req_url, header_str, time_stamp = row
                else:
                    row_id, crawl_id, header_str, time_stamp = row
                header = load_headers(header_str)
                for cookie_str in header[header_name]:
                    if is_response:
                        parsed = parse_cookies(cookie_str, verbose,
                                               url=req_url,
                                               response_cookie=True)
                    else:
                        parsed = parse_cookies(cookie_str, verbose)
                    for fields in parsed:
                        writer.execute(
                            insert_sql,
                            (crawl_id, row_id) + fields + (time_stamp,))
                        processed += 1
                # Commit once at least `batch_size` cookies have accumulated
                # since the last commit. Testing for an exact multiple would
                # miss the commit whenever one row adds several cookies and
                # the counter steps over the multiple.
                if processed - last_commit >= batch_size:
                    last_commit = processed
                    con.commit()
                    if verbose:
                        print(str(processed) + " Cookies Processed")
            con.commit()

        reader.execute(
            "CREATE TABLE IF NOT EXISTS http_request_cookies ( "
            "id INTEGER PRIMARY KEY AUTOINCREMENT, "
            "crawl_id INTEGER NOT NULL, "
            "header_id INTEGER NOT NULL, "
            "name VARCHAR(200) NOT NULL, "
            "value TEXT NOT NULL, "
            "accessed DATETIME);")
        reader.execute(
            "CREATE TABLE IF NOT EXISTS http_response_cookies ( "
            "id INTEGER PRIMARY KEY AUTOINCREMENT, "
            "crawl_id INTEGER NOT NULL, "
            "header_id INTEGER NOT NULL, "
            "name VARCHAR(200) NOT NULL, "
            "value TEXT NOT NULL, "
            "domain VARCHAR(500), "
            "path VARCHAR(500), "
            "expires DATETIME, "
            "max_age REAL, "
            "httponly BOOLEAN, "
            "secure BOOLEAN, "
            "comment VARCHAR(200), "
            "version VARCHAR(100), "
            "accessed DATETIME);")
        con.commit()

        # Parse http request cookies
        copy_cookies(
            "SELECT id, crawl_id, headers, time_stamp "
            "FROM http_requests_proxy "
            "WHERE id NOT IN (SELECT header_id FROM http_request_cookies)",
            "INSERT INTO http_request_cookies "
            "(crawl_id, header_id, name, value, accessed) "
            "VALUES (?,?,?,?,?)",
            'Cookie',
            False)
        print("Processing HTTP Request Cookies Complete")

        # Parse http response cookies
        copy_cookies(
            "SELECT id, crawl_id, url, headers, time_stamp "
            "FROM http_responses_proxy "
            "WHERE id NOT IN (SELECT header_id FROM http_response_cookies)",
            "INSERT INTO http_response_cookies "
            "(crawl_id, header_id, name, "
            "value, domain, path, expires, max_age, "
            "httponly, secure, comment, version, accessed) "
            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
            'Set-Cookie',
            True)
        print("Processing HTTP Response Cookies Complete")
    finally:
        # Release the database handle even when a statement or parse fails.
        con.close()
def build_http_cookie_table(database, verbose=False):
    """Extract cookies from stored HTTP headers into dedicated cookie tables.

    Creates the ``http_request_cookies`` and ``http_response_cookies`` tables
    if they do not exist yet. It then reads every row of ``http_requests`` /
    ``http_responses`` whose id is not already referenced as ``header_id`` in
    the matching cookie table, parses its ``Cookie`` / ``Set-Cookie`` header
    values with ``parse_cookies`` and inserts one row per tuple that
    ``parse_cookies`` returns.

    Args:
        database: Database path handed to ``sqlite3.connect``.
        verbose: Forwarded to ``parse_cookies``; when true a progress line is
            also printed at every intermediate commit.

    Raises:
        sqlite3.Error: Propagated from any failing statement (for example
            when a source table is missing). The connection is closed before
            the exception leaves this function.
    """
    # NOTE(review): an earlier definition with this same name, reading from
    # http_requests_proxy / http_responses_proxy, appears above in this file;
    # this later definition shadows it at import time -- confirm that is
    # intended.

    # Number of inserted cookies after which pending work is committed.
    batch_size = 10000

    con = sqlite3.connect(database)
    try:
        source_cur = con.cursor()  # walks the source header rows
        insert_cur = con.cursor()  # inserts while `source_cur` is still open

        def load_headers(header_str):
            """Deserialize one stored header blob into an ODictCaseless."""
            header = ODictCaseless()
            try:
                header.load_state(json.loads(header_str))
            except ValueError:
                # XXX temporary shim -- should be removed.
                # SECURITY: eval() runs arbitrary expressions; header_str
                # comes from crawled data stored in the database. Kept only
                # for rows that were not serialized as JSON.
                header.load_state(eval(header_str))
            return header

        def copy_cookies(select_sql, insert_sql, header_name, is_response):
            """Parse `header_name` of each selected row and insert the cookies."""
            processed = 0
            last_commit = 0
            source_cur.execute(select_sql)
            for row in source_cur:
                if is_response:
                    row_id, crawl_id, req_url, header_str, time_stamp = row
                else:
                    row_id, crawl_id, header_str, time_stamp = row
                header = load_headers(header_str)
                for cookie_str in header[header_name]:
                    if is_response:
                        parsed = parse_cookies(cookie_str, verbose,
                                               url=req_url,
                                               response_cookie=True)
                    else:
                        parsed = parse_cookies(cookie_str, verbose)
                    for fields in parsed:
                        insert_cur.execute(
                            insert_sql,
                            (crawl_id, row_id) + fields + (time_stamp,))
                        processed += 1
                # Commit once at least `batch_size` cookies have accumulated
                # since the last commit. Testing for an exact multiple would
                # miss the commit whenever one row adds several cookies and
                # the counter steps over the multiple.
                if processed - last_commit >= batch_size:
                    last_commit = processed
                    con.commit()
                    if verbose:
                        print(str(processed) + " Cookies Processed")
            con.commit()

        source_cur.execute(
            "CREATE TABLE IF NOT EXISTS http_request_cookies ( "
            "id INTEGER PRIMARY KEY AUTOINCREMENT, "
            "crawl_id INTEGER NOT NULL, "
            "header_id INTEGER NOT NULL, "
            "name VARCHAR(200) NOT NULL, "
            "value TEXT NOT NULL, "
            "accessed DATETIME);")
        source_cur.execute(
            "CREATE TABLE IF NOT EXISTS http_response_cookies ( "
            "id INTEGER PRIMARY KEY AUTOINCREMENT, "
            "crawl_id INTEGER NOT NULL, "
            "header_id INTEGER NOT NULL, "
            "name VARCHAR(200) NOT NULL, "
            "value TEXT NOT NULL, "
            "domain VARCHAR(500), "
            "path VARCHAR(500), "
            "expires DATETIME, "
            "max_age REAL, "
            "httponly BOOLEAN, "
            "secure BOOLEAN, "
            "comment VARCHAR(200), "
            "version VARCHAR(100), "
            "accessed DATETIME);")
        con.commit()

        # Parse http request cookies
        copy_cookies(
            "SELECT id, crawl_id, headers, time_stamp "
            "FROM http_requests "
            "WHERE id NOT IN (SELECT header_id FROM http_request_cookies)",
            "INSERT INTO http_request_cookies "
            "(crawl_id, header_id, name, value, accessed) "
            "VALUES (?,?,?,?,?)",
            'Cookie',
            False)
        print("Processing HTTP Request Cookies Complete")

        # Parse http response cookies
        copy_cookies(
            "SELECT id, crawl_id, url, headers, time_stamp "
            "FROM http_responses "
            "WHERE id NOT IN (SELECT header_id FROM http_response_cookies)",
            "INSERT INTO http_response_cookies "
            "(crawl_id, header_id, name, "
            "value, domain, path, expires, max_age, "
            "httponly, secure, comment, version, accessed) "
            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
            'Set-Cookie',
            True)
        print("Processing HTTP Response Cookies Complete")
    finally:
        # Release the database handle even when a statement or parse fails.
        con.close()