def test_create_asset_timeline(har_data):
    """
    Tests the asset timeline function by making sure that it inserts one
    object correctly.
    """
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    entry = har_data('single_entry.har')
    # Get the datetime object of the start time and total load time
    time_key = dateutil.parser.parse(entry['startedDateTime'])
    load_time = int(entry['time'])
    asset_timeline = har_parser.create_asset_timeline([entry])
    # The number of entries in the timeline should match the load time
    assert len(asset_timeline) == load_time
    for t in range(1, load_time):
        assert time_key in asset_timeline
        assert len(asset_timeline[time_key]) == 1
        # Compare the dicts
        for key, value in iteritems(entry):
            assert asset_timeline[time_key][0][key] == entry[key]
        time_key = time_key + datetime.timedelta(milliseconds=1)

def test_create_asset_timeline(har_data):
    """
    Tests the asset timeline function by making sure that it inserts one
    object correctly.
    """
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    entry = har_data('single_entry.har')
    # Get the datetime object of the start time and total load time
    time_key = dateutil.parser.parse(entry['startedDateTime'])
    load_time = int(entry['time'])
    asset_timeline = har_parser.create_asset_timeline([entry])
    # The number of entries in the timeline should match the load time
    assert len(asset_timeline) == load_time
    for t in range(1, load_time):
        assert time_key in asset_timeline
        assert len(asset_timeline[time_key]) == 1
        # Compare the dicts (items() rather than the Python 2-only iteritems())
        for key, value in entry.items():
            assert asset_timeline[time_key][0][key] == entry[key]
        time_key = time_key + datetime.timedelta(milliseconds=1)

def test_init(har_data):
    # Make sure we only tolerate valid input
    with pytest.raises(ValueError):
        har_parser = HarParser('please_dont_work')
        assert har_parser

    har_data = har_data('humanssuck.net.har')
    har_parser = HarParser(har_data)
    for page in har_parser.pages:
        assert isinstance(page, HarPage)

    assert har_parser.browser == {'name': 'Firefox', 'version': '25.0.1'}
    assert har_parser.version == '1.1'
    assert har_parser.creator == {'name': 'Firebug', 'version': '1.12'}

def test_match_status_code(har_data):
    """
    Tests the ability of the parser to match status codes.
    """
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    entry = har_data('single_entry.har')

    # TEST THE REGEX FEATURE FIRST #
    assert har_parser.match_status_code(entry, '2.*')
    assert not har_parser.match_status_code(entry, '3.*')

    # TEST LITERAL STRING MATCH #
    assert har_parser.match_status_code(entry, '200', regex=False)
    assert not har_parser.match_status_code(entry, '201', regex=False)

def test_http_version(har_data):
    """
    Tests the ability of the parser to match HTTP versions.
    """
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    entry = har_data('single_entry.har')

    # TEST THE REGEX FEATURE FIRST #
    assert har_parser.match_http_version(entry, '.*1.1')
    assert not har_parser.match_http_version(entry, '.*2')

    # TEST LITERAL STRING MATCH #
    assert har_parser.match_http_version(entry, 'HTTP/1.1', regex=False)
    assert not har_parser.match_http_version(entry, 'HTTP/2.0', regex=False)

def test_init(har_data):
    """
    Test the object loading
    """
    with pytest.raises(ValueError):
        page = HarPage(PAGE_ID)

    init_data = har_data('humanssuck.net.har')

    # Throws PageNotFoundException with bad page ID
    with pytest.raises(PageNotFoundError):
        page = HarPage(BAD_PAGE_ID, har_data=init_data)

    # Make sure it can load with either har_data or a parser
    page = HarPage(PAGE_ID, har_data=init_data)
    assert isinstance(page, HarPage)
    parser = HarParser(init_data)
    page = HarPage(PAGE_ID, har_parser=parser)
    assert isinstance(page, HarPage)

    assert len(page.entries) == 4
    # Make sure that the entries are actually in order. Going a little bit
    # old school here.
    for index in range(0, len(page.entries)):
        if index != len(page.entries) - 1:
            current_date = dateutil.parser.parse(
                page.entries[index]['startedDateTime'])
            next_date = dateutil.parser.parse(
                page.entries[index + 1]['startedDateTime'])
            assert current_date <= next_date

def scan_files(path):
    data = []
    # Parse all files in directory
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r') as f:
            har_parser = HarParser(json.loads(f.read()))
            start_time = dateutil.parser.parse(
                har_parser.pages[0].entries[0]["startedDateTime"])
            latest_time = start_time
            # Parse all resources HTML, CSS, JS...
            for entry in har_parser.pages[0].entries:
                if entry["time"] is None:
                    s = 0
                else:
                    s = float(entry["time"]) / 1000
                current_time = dateutil.parser.parse(
                    entry["startedDateTime"]) + datetime.timedelta(seconds=s)
                if current_time > latest_time:
                    latest_time = current_time
            total = latest_time - start_time
            # if total < datetime.timedelta(seconds = 1000):
            #     os.remove(os.path.join(path, filename))
            #     print(filename)
            data.append(total.total_seconds() * 1000)
    return data

def check_service_in_har(har_data, service_name):
    logging.info('Checking for service --> ' + service_name)
    har_parser = HarParser(json.loads(har_data))
    for x in har_parser.har_data['entries']:
        if x['request']['url'] == service_name:
            logging.info('got service -> ' + service_name)
            return True

def save_har_to_csv(test, testname, service_list, desc_list):
    import csv
    harname = os.path.join(
        os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir)),
        'temp', testname + '.har')
    csv_name = os.path.join(
        os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir)),
        'temp', testname + '.csv')
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(harname, 'r') as f:
        har_parser = HarParser(json.loads(f.read()))
    with open(csv_name, mode='x') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',', lineterminator='\n')
        csv_writer.writerow(['desc', 'url', 'status', 'response_type', 'time', 'starttime'])
        for x in har_parser.har_data['entries']:
            if x['request']['url'] in service_list:
                desc = desc_list[service_list.index(x['request']['url'])]
                url = x['request']['url']
                status = x['response']['status']
                time = x['time']
                start = x['startedDateTime']
                csv_writer.writerow([desc, url, status, 'actual', time, start])
        # write expected to csv
        for x in test['api']:
            csv_writer.writerow([x['description'], x['servicename'], x['status_code'],
                                 'expected', x['expectedresponseinms'], 0])
    return csv_name

def __get_page_content_from_har(self):
    with open(self.har_path, "r") as f:
        har_parser = HarParser(json.loads(f.read()))
        for page in har_parser.pages[:1]:
            for file in page.html_files:
                return file["response"]["content"]["text"]
    raise Exception("Unable to access HAR file.")

def harparser(self):
    """
    Captures the har and converts to a HarParser object

    :return: HarParser object, a page from har capture
    """
    result_har = json.dumps(self._client.har, ensure_ascii=False)
    har_parser = HarParser(json.loads(result_har))
    return har_parser.pages[0]

def test_match_request_type(har_data):
    """
    Tests the ability of the parser to match a request type.
    """
    # The HarParser does not work without a full har file, but we only want
    # to test a piece, so this initial load is just so we can get the object
    # loaded, we don't care about the data in that HAR file.
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    entry = har_data('single_entry.har')

    # TEST THE REGEX FEATURE FIRST #
    assert har_parser.match_request_type(entry, '.*ET')
    assert not har_parser.match_request_type(entry, '.*ST')

    # TEST LITERAL STRING MATCH #
    assert har_parser.match_request_type(entry, 'GET', regex=False)
    assert not har_parser.match_request_type(entry, 'POST', regex=False)

def get_pages(self):
    if self.pages:
        return self.pages
    try:
        if 'har' not in self:
            return []
        har_parser = HarParser(self['har'])
        self.pages = har_parser.pages
        return self.pages
    except Exception as e:
        logging.warning('Saw exception when parsing HAR: {}'.format(e))
        return []

def setHeadersFromHarFile(self, fileName, urlMustContain):
    if not os.path.exists(fileName):
        return

    try:
        from pathlib import Path

        headersList = []

        if Path(fileName).suffix == '.har':
            from haralyzer import HarParser

            file = helpers.getFile(fileName)
            j = json.loads(file)
            har_page = HarParser(har_data=j)

            # find the right url
            for page in har_page.pages:
                for entry in page.entries:
                    if urlMustContain in entry['request']['url']:
                        headersList = entry['request']['headers']
                        break
        else:
            headersList = helpers.getJsonFile(fileName)
            headersList = get(headersList, 'headers')

        headers = []

        for header in headersList:
            name = header.get('name', '')
            value = header.get('value', '')

            # ignore pseudo-headers
            if name.startswith(':'):
                continue

            if name.lower() == 'content-length' or name.lower() == 'host':
                continue

            # otherwise response will stay compressed and unreadable
            if name.lower() == 'accept-encoding' and not self.hasBrotli:
                value = value.replace(', br', '')

            newHeader = (name, value)
            headers.append(newHeader)

        self.headers = OrderedDict(headers)
    except Exception as e:
        helpers.handleException(e)

def main(args):
    logging.basicConfig(level=args.level)

    with open(args.archive, "r", encoding="utf-8") as f:
        body = json.load(f)

    har_parser = HarParser(body)

    from visitors import HttpArchiveVisitor
    visitor = HttpArchiveVisitor()
    visitor.visit(har_parser)
    visitor.summarize()

def parse_file(f):
    har_parser = HarParser(json.loads(f))
    rows = [['X-CACHE-HEADER', 'BYTES', 'URL']]
    hosts = {}
    size = {}
    total_bytes = 0  # total bytes for all content across the entire thing

    for page in har_parser.pages:
        assert isinstance(page, HarPage)
        for entry in page.entries:
            cdn = []
            headers = entry['response']['headers']
            # print(entry['response'], file=sys.stderr)
            cdn_str = None
            total_bytes += entry['response']['content']['size']
            # pp.pprint(entry['request'])
            url = urlparse(entry['request']['url'])
            for h in headers:
                if h['name'] == 'x-cache':
                    hosts[url.netloc] = 1
                    # print(url, file=sys.stderr)
                    cdn_str = h['value']
                    cdn.append(cdn_str)
                    if cdn_str in size:
                        size[cdn_str] = size[cdn_str] + entry['response']['content']['size']
                    else:
                        size[cdn_str] = entry['response']['content']['size']
            print("\t".join([
                str(cdn),
                str(entry['response']['content']['size']),
                entry['request']['url'],
                url.netloc
            ]))
            rows.append([
                cdn,
                entry['response']['content']['size'],
                linkify(entry['request']['url'])
            ])

    bysize = [['CACHE TAG', '% OF BYTES']]
    for sk in size.keys():
        bysize.append([sk, "{:.1%}".format(size[sk] / total_bytes)])
    bysize_t = list(map(list, zip(*bysize)))
    hosts_t = list(map(list, zip(*[hosts.keys()])))

    return {
        'total_bytes': total_bytes,
        'hosts_t': hosts_t,
        'bysize': bysize,
        'rows': rows
    }

def get_entries(filename: str, entry_id: int = None) -> (dict, list):
    """Gets either all the entries or a certain one"""
    with open(
        os.path.join(os.getenv("UPLOAD_FOLDER", "/tmp"), filename),  # nosec
        "r",
        encoding="utf-8",
    ) as process_file:
        render_pages = HarParser(json.loads(process_file.read())).pages
    items = [entry for page in render_pages for entry in page.entries]
    if isinstance(entry_id, int):
        return items[entry_id]
    return items

def test_init_entry_with_no_pageref(har_data):
    '''
    If we find an entry with no pageref it should end up in a HarPage object
    with page ID of unknown
    '''
    data = har_data('missing_pageref.har')
    har_parser = HarParser(data)
    # We should have two pages. One is defined in the pages key of the har file
    # but has no entries. The other should be our unknown page, with a single
    # entry
    assert len(har_parser.pages) == 2
    page = [p for p in har_parser.pages if p.page_id == 'unknown'][0]
    assert len(page.entries) == 1

def capture_url_traffic(self, url, wait_time=0):
    """
    Capture the har for a given url

    :param str url: url to capture traffic for
    :param int wait_time: time to wait after the page load
    :return: HarParser object, a page from har capture
    """
    self._client.new_har(options={'captureHeaders': True})
    self._driver.goto_url(url, absolute=True)
    time.sleep(wait_time)
    result_har = json.dumps(self._client.har, ensure_ascii=False)
    har_parser = HarParser(json.loads(result_har))
    return har_parser.pages[0]

def setHeadersFromHarFile(self, fileName, urlMustContain):
    try:
        from pathlib import Path

        headersList = []

        if Path(fileName).suffix == '.har':
            from haralyzer import HarParser

            file = helpers.getFile(fileName)
            j = json.loads(file)
            har_page = HarParser(har_data=j)

            # find the right url
            for page in har_page.pages:
                for entry in page.entries:
                    if urlMustContain in entry['request']['url']:
                        headersList = entry['request']['headers']
                        break
        else:
            headersList = helpers.getJsonFile(fileName)
            headersList = get(headersList, 'headers')

        headers = []

        for header in headersList:
            name = header.get('name', '')

            # ignore pseudo-headers
            if name.startswith(':'):
                continue

            if name.lower() == 'content-length' or name.lower() == 'host':
                continue

            newHeader = (name, header.get('value', ''))
            headers.append(newHeader)

        self.headers = OrderedDict(headers)
    except Exception as e:
        helpers.handleException(e)

def extract_adobe_from_har(file_path_to_har_file):
    list_to_print = []
    with open(file_path_to_har_file, "r") as f:
        har_parser = HarParser(json.loads(f.read()))

    for har_page in har_parser.pages:
        ## POST requests
        post_requests = har_page.post_requests

        # filter for adobe hits
        adobe_post_hits = []
        for request in post_requests:
            if "https://woolworthsfoodgroup.sc.omtrdc" in request["request"]["url"]:
                adobe_post_hits.append(request)
                # print(json.dumps(request, indent=4))

        for adobe_post_hit in adobe_post_hits:
            query = parse_query_string(adobe_post_hit["request"]["postData"]["text"])
            list_to_print.append(query)

        ## GET requests
        get_requests = har_page.get_requests

        # filter adobe requests
        for request in get_requests:
            if "https://woolworthsfoodgroup.sc.omtrdc" in request["request"]["url"]:
                # print(request["request"]["url"])
                my_url = request["request"]["url"]
                parsed = urllib.parse.urlparse(my_url)
                data_sent = urllib.parse.unquote(str(parsed.query))
                query = parse_query_string(parsed.query)
                list_to_print.append(query)

    new_list = sorted(list_to_print, key=lambda k: k["t"])
    return new_list

def parse_har_file(har_file):
    """
    Parse a HAR file into a list of request objects

    This currently filters requests by content_type (text/html)
    """
    har_parser = HarParser(json.load(har_file))
    requests = []
    for page in har_parser.pages:
        entries = page.filter_entries(content_type='text/html')
        for entry in entries:
            entry_request = entry['request']
            request_base_url = "{0.scheme}://{0.netloc}".format(
                urlsplit(entry_request['url']))
            request = {
                'method': entry_request['method'],
                'url': entry_request['url'].replace(request_base_url, ""),
                'datetime': dateutil.parser.parse(entry['startedDateTime']),
            }
            if entry_request['method'] == 'POST':
                request['data'] = {
                    unquote_plus(item['name']): unquote_plus(item['value'])
                    for item in entry_request['postData']['params']
                }
                request['data'].pop('csrf_token', None)
            requests.append(request)

    requests.sort(key=itemgetter('datetime'))
    for request in requests:
        request.pop('datetime', None)

    return {'requests': requests}

def get_response_contents_from_har(har_path):
    response_contents = defaultdict(str)
    with open(har_path, 'r') as f:
        try:
            har_parser = HarParser(json.loads(f.read()))
        except ValueError:
            return response_contents

    for page in har_parser.pages:
        for entry in page.entries:
            try:
                url = entry["request"]["url"]
                base_url = url.split("?")[0].split("#")[0]
                mime_type = entry["response"]["content"]["mimeType"]
                if "image" in mime_type or "font" in mime_type or \
                        "css" in mime_type:
                    continue
                # print mime_type
                body = entry["response"]["content"]["text"]
                # print url, body[:128]
                # response_contents.append((url, body))
                response_contents[base_url] += ("\n======\n" + body)
            except Exception:
                pass
    return response_contents

def get_info_from_har(file_path):
    with open(file_path, 'r', encoding='UTF8') as f:
        har_parser = HarParser(json.loads(f.read()))

    method = har_parser.pages[0].actual_page['request']['method']
    url = har_parser.pages[0].actual_page['request']['url']

    headers = {}
    for header in har_parser.pages[0].actual_page['request']['headers']:
        key = header['name']
        value = header['value']
        headers[key] = value

    queryString = har_parser.pages[0].actual_page['request']['queryString']
    cookies = har_parser.pages[0].actual_page['request']['cookies']

    context = {
        'method': method,
        'url': url,
        'headers': headers,
        'queryString': queryString,
        'cookies': cookies
    }
    return context

def read_har(harfile):
    # Read harfile and return haralyzer parser
    with open(harfile, 'r') as f:
        har_parser = HarParser(json.loads(f.read()))
    return har_parser

import ast
import json
import csv

from bs4 import BeautifulSoup
from haralyzer import HarParser

with open('../../Downloads/public.tableau.com_base_6.har', 'r') as f:
# with open('../../Downloads/tahir-data/public.tableau.com_two.har', 'r') as f:
# with open('../../Downloads/men_baseball/3-harib-mensbaseball.har', 'r') as f:
# with open('../../Downloads/men_baseball/13full-harib-mensbaseball.har', 'r') as f:
# with open('../../Downloads/soccer/mens_soccer_6.har', 'r') as f:
# with open('../../Downloads/soccer/mens_soccer_8.har', 'r') as f:
# with open('../../Downloads/soccer/mens_soccer_15.har', 'r') as f:
    har_parser = HarParser(json.loads(f.read()))

tree_list = []
for page in har_parser.pages:
    for index, entry in enumerate(page.entries):
        if har_parser.match_request_type(entry, 'POST'):
            text_str = str(entry.get('response').get('content').get('text'))
            if text_str.__contains__('School Name:') and not text_str.__contains__('518965;'):
                data = json.loads(text_str.strip())
                # data = text_str
                # import pdb;pdb.set_trace()
                # data = text_str
                # data = ast.literal_eval(entry.get('response').get('content').get('text'))
                print(text_str)
                cmdResultList = data.get('vqlCmdResponse').get('cmdResultList')
                print(cmdResultList)
                for index, i in enumerate(cmdResultList):
                    # try:

def __init__(self, pydict):
    self.__har = pydict
    self.__har_parser = HarParser(pydict)

def test_match_headers(har_data):
    # The HarParser does not work without a full har file, but we only want
    # to test a piece, so this initial load is just so we can get the object
    # loaded, we don't care about the data in that HAR file.
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    raw_headers = har_data('single_entry.har')

    # Make sure that bad things happen if we don't give it response/request
    test_data = {
        'captain beefheart': {
            'accept': '.*text/html,application/xhtml.*',
            'host': 'humanssuck.*',
            'accept-encoding': '.*deflate',
        },
    }
    with pytest.raises(ValueError):
        _headers_test(har_parser, raw_headers, test_data, True, True)

    # TEST THE REGEX FEATURE FIRST #

    # These should all be True
    test_data = {
        'request': {
            'accept': '.*text/html,application/xhtml.*',
            'host': 'humanssuck.*',
            'accept-encoding': '.*deflate',
        },
        'response': {
            'server': 'nginx',
            'content-type': 'text.*',
            'connection': '.*alive',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, True, True)

    test_data = {
        'request': {
            'accept': '.*text/xml,application/xhtml.*',
            'host': 'humansrule.*',
            'accept-encoding': 'i dont accept that',
        },
        'response': {
            'server': 'apache',
            'content-type': 'application.*',
            'connection': '.*dead',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, False, True)

    # Test literal string matches #

    # These should all be True
    test_data = {
        'request': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'host': 'humanssuck.net',
            'accept-encoding': 'gzip, deflate',
        },
        'response': {
            'server': 'nginx',
            'content-type': 'text/html; charset=UTF-8',
            'connection': 'keep-alive',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, True, False)

    test_data = {
        'request': {
            'accept': 'I accept nothing',
            'host': 'humansrule.guru',
            'accept-encoding': 'i dont accept that',
        },
        'response': {
            'server': 'apache',
            'content-type': 'your mom',
            'connection': 'not keep-alive',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, False, False)

import json
import sys

from haralyzer import HarParser, HarPage
from numpy import trapz
import pandas as pd
import asciiplotlib as apl
# import matplotlib.pyplot as plt

# Handle too many or not enough inputs
if len(sys.argv) < 2:
    raise Exception("Error: need a path to HAR file as command-line argument")
elif len(sys.argv) > 2:
    raise Exception("Error: gave too many command-line arguments")

# Get HAR archive File name (as command-line argument)
har = sys.argv[1]
with open(har, 'r') as f:
    har_parser = HarParser(json.loads(f.read()))

# Get onLoad per page load
page_onLoad = []
for item in har_parser.har_data["pages"]:
    page_onLoad.append(item.get("pageTimings").get("onLoad"))

# Get total in bytes for _bytesIn and _objectSize
numPages = 0
total_bytesIn = []
total_objectSize = []
for page in har_parser.pages:
    numPages += 1
    byteSize = objSize = 0
    for entry in page.entries:
        byteSize += int(entry["_bytesIn"])

    'WWW-Authenticate', 'X-Frame-Options', 'A-IM', 'Accept', 'Accept-Charset',
    'Accept-Datetime', 'Accept-Encoding', 'Accept-Language',
    'Access-Control-Request-Method', 'Access-Control-Request-Headers',
    'Authorization', 'Cache-Control', 'Connection', 'Content-Length',
    'Content-MD5', 'Content-Type', 'Cookie', 'Date', 'Expect', 'Forwarded',
    'From', 'Host', 'HTTP2-Settings', 'If-Match', 'If-Modified-Since',
    'If-None-Match', 'If-Range', 'If-Unmodified-Since', 'Max-Forwards',
    'Origin', 'Pragma', 'Proxy-Authorization', 'Range', 'Referer', 'TE',
    'Upgrade', 'User-Agent', 'Via', 'Warning'
]

FIELDS = []
for a in FIELDSs:
    FIELDS.append(a.lower())

with open('arcCSP.har', 'r') as f:
    data = HarParser(json.loads(f.read()))

for page in data.pages:
    toprint = ""
    toprint = toprint + "=========================\n" + str(page)
    print(toprint)
    for entry in page.entries:
        tab = entry['request']['headers']
        toprinta = ""
        toprinta = toprinta + entry['request']['url'] + "\n" + \
            entry['request']['httpVersion'] + "\n"
        # print(entry['request']['url'])
        # print(entry['request']['httpVersion'])
        # print(' ')
        i = 0
        for aa in tab:

            url = "https://www.instagram.com/p/%s/" % shortcode
            ts = int(time)
            utc = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            g.write("%s,%s,%s,%s,%s,%s\n" %
                    (shortcode, url, time, utc, likes, comments))
        except Exception as e:
            # print(e)
            pass
    # return shortcode_list2


if __name__ == "__main__":
    with open(sys.argv[1], 'rb') as f:
        har = f.read()

    har_parser = HarParser(json.loads(har))
    har_page = HarPage('page_4', har_data=json.loads(har))

    x = len(har_page.entries)
    for i in range(0, x):
        resource_type = har_page.entries[i]['_resourceType']
        # print(resource_type)
        req_url = har_page.entries[i]['request']['url']
        if req_url == "https://www.instagram.com/katyperry/":
            # First 12 posts
            res = har_page.entries[0]['response']['content']['text']
            # print(res)
            first_12_posts = get_shortcode_first(res)
        elif resource_type == "xhr" and req_url.startswith(
                "https://www.instagram.com/graphql/query/?query_hash="):
            # for other posts
            res = har_page.entries[i]['response']['content']['text']