def test_get_path_2nd_child(self):
    '''The second of two sibling <p> elements gets P:nth-of-type(2).'''
    html_lines = [
        '<html>',
        ' <body>',
        ' <p></p>',
        ' <p></p>',
        ' </body>',
        '</html>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    second_paragraph = document.find_all('p')[1]
    self.assertEqual(
        get_css_selector(second_paragraph),
        'HTML > BODY:nth-of-type(1) > P:nth-of-type(2)')
def test_do_not_count_children_of_different_type(self):
    '''A <ul> between two <p> siblings does not change the <p> index.'''
    html_lines = [
        '<html>',
        ' <body>',
        ' <p></p>',
        ' <ul></ul>',  # this is not a <p> element
        ' <p></p>',
        ' </body>',
        '</html>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    second_paragraph = document.find_all('p')[1]
    self.assertEqual(
        get_css_selector(second_paragraph),
        'HTML > BODY:nth-of-type(1) > P:nth-of-type(2)')
def test_extract_command_with_variables(self):
    '''The expected region starts at 0, so it covers the VAR=val prefix.'''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument('<code>VAR=val wget http://google.com</code>')
    found = wget_extractor.extract(document)
    first = found[0]
    # Offsets 0..29 span the whole 30-character command line.
    self.assertEqual(first.start_offset, 0)
    self.assertEqual(first.end_offset, 29)
def test_single_redirect_at_start_of_line_is_ignored(self):
    '''A lone ">" before the command is left out of the region.'''
    sed_extractor = CommandExtractor('sed')
    document = HtmlDocument("<code>>sed 's/patt/repl/' file.txt</code>")
    found = sed_extractor.extract(document)
    self.assertEqual(len(found), 1)
    # Offset 1 is the "s" of "sed", i.e. just past the leading ">".
    self.assertEqual(found[0].start_offset, 1)
def test_extract_includes_redirect(self):
    '''Trailing output redirections are inside the expected region.'''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument("<code>wget google.com > /dev/null 2>&1</code>")
    found = wget_extractor.extract(document)
    first = found[0]
    # Offsets 0..31 span the whole line, "> /dev/null 2>&1" included.
    self.assertEqual(first.start_offset, 0)
    self.assertEqual(first.end_offset, 31)
def wrapper(request):
    '''
    Explain the text POSTed in `request` and answer with a JSON document.

    Reads 'text', 'client_start_time' and 'edge_size' (default 0) from
    request.POST, logs the query and its region through DbLogger, runs
    explain_func (taken from the enclosing scope, not visible here) on the
    text, and returns an HttpResponse whose body is the JSON-encoded
    explained region together with the query metadata.
    '''
    text = request.POST.get('text')
    client_start_time = request.POST.get('client_start_time')
    edge_size = int(request.POST.get('edge_size', 0))

    db_logger = DbLogger()
    query_record = db_logger.log_query(request)

    # One region covering the submitted text from offset 0 to its last index.
    document = HtmlDocument(text)
    region = Region(document, 0, len(text) - 1, text)
    explanation = explain_func(text, edge_size)
    region_record = db_logger.log_region(request, query_record, region)
    explained_region = _package_region(
        region, explanation, region_record.id, query_record.id)

    # Update the runtime of the scan
    db_logger.update_server_end_time(query_record)

    payload = {
        "region": explained_region,
        "url": _get_resource_url(request, "/api/v1/client_query/"),
        "sq_id": query_record.id,
        "client_start_time": client_start_time,
        "error": 0
    }
    return HttpResponse(json.dumps(payload, indent=2))
def test_extract_from_crontab(self):
    '''Only the command after the crontab schedule fields is expected.'''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument("<code>*/5 * * * * wget mysite.com</code>")
    found = wget_extractor.extract(document)
    first = found[0]
    # "*/5 * * * * " occupies offsets 0..11; the command is 12..26.
    self.assertEqual(first.start_offset, 12)
    self.assertEqual(first.end_offset, 26)
def test_detect_in_crontab_entry(self):
    '''WgetExtractor is expected to find wget after a crontab schedule.'''
    wget_extractor = WgetExtractor()
    document = HtmlDocument('<code>*/5 * * * * wget google.com</code>')
    found = wget_extractor.extract(document)
    first = found[0]
    # "*/5 * * * * " occupies offsets 0..11; the command is 12..26.
    self.assertEqual(first.start_offset, 12)
    self.assertEqual(first.end_offset, 26)
def test_extract_multiple(self):
    '''Both `$(...)` calls inside one <code> block are expected as regions.'''
    document = HtmlDocument("\n".join([
        "<code>",
        " $('p').text('hello');",
        " var input = $('input.klazz');",
        # Well-formed closing tag, so the fixture does not depend on how the
        # HTML parser recovers from an unterminated "</code".
        "</code>",
    ]))
    regions = self.extractor.extract(document)
    self.assertEqual(len(regions), 2)
def test_handle_args_inside_carats(self):
    '''Placeholder arguments in angle brackets stay inside the region.'''
    wget_extractor = WgetExtractor()
    document = HtmlDocument('<code>wget -A<ext> <URL></code>')
    found = wget_extractor.extract(document)
    self.assertEqual(len(found), 1)
    only = found[0]
    # Offsets 0..17 span all 18 characters of "wget -A<ext> <URL>".
    self.assertEqual(only.start_offset, 0)
    self.assertEqual(only.end_offset, 17)
def test_extract_empty_lines(self):
    '''Empty lines around the text line each count as a region.'''
    html_lines = [
        '<code>',
        ' First line',
        '</code>',
    ]
    code_node = HtmlDocument('\n'.join(html_lines)).code
    found = self.extractor.extract(code_node)
    # The <code> text is: empty line, " First line", empty line.
    self.assertEqual(len(found), 3)
def test_extract_command_by_regex(self):
    '''The command name passed to CommandExtractor may be a regex.'''
    # Raw string: "\." is not a recognised str escape, so the non-raw form
    # triggers an invalid-escape warning on current Python versions.
    extractor = CommandExtractor(r'wget(\.exe)?')
    node = HtmlDocument(
        '<code>my-shell$ wget.exe http://google.com</code>')
    regions = extractor.extract(node)
    r = regions[0]
    # "my-shell$ " occupies offsets 0..9; the command itself is 10..35.
    self.assertEqual(r.start_offset, 10)
    self.assertEqual(r.end_offset, 35)
def test_skip_regex_with_repeated_flags(self):
    '''A regex literal with the duplicated flag "gg" yields no regions.'''
    html_lines = [
        '<code>',
        "var pattern = /regular-expression/gg;",
        '</code>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    self.assertEqual(len(found), 0)
def test_skip_code_that_doesnt_pass_javascript_parser(self):
    '''Text that is not valid JavaScript is expected to yield no regions.'''
    html_lines = [
        '<code>',
        "<>/regex/;",
        '</code>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    self.assertEqual(len(found), 0)
def test_ignore_addresses_that_arent_regex(self):
    '''A sed script with only numeric line addresses yields no regions.'''
    html_lines = [
        '<code>',
        'sed "0,1p" file',
        '</code>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    self.assertEqual(len(found), 0)
def test_handle_find_pattern_with_character_class(self):
    '''
    Regression test: the find pattern used to be looked up in the
    original command with a regex search, so it was interpreted as a
    regex instead of a raw string, and this case failed.
    '''
    document = HtmlDocument('<code>sed "s/[A-Z]bc//g" file.txt</code>')
    found = self.extractor.extract(document)
    self.assertEqual(len(found), 1)
def test_skip_if_more_than_one_url_and_no_args(self):
    '''
    Several "URLs" with no args suggest the text is part of a prose
    sentence rather than a command invocation, so nothing is expected
    to be extracted.
    '''
    wget_extractor = WgetExtractor()
    document = HtmlDocument('<code>wget url1 url2</code>')
    found = wget_extractor.extract(document)
    self.assertEqual(len(found), 0)
def test_document_is_the_same_as_passed_in(self):
    '''The region's node is still attached to the full parsed document.'''
    document = HtmlDocument('<div><p>hello</p></div>')
    scanner = NodeScanner(HelloTextExtractor(), ['p'])
    found = scanner.scan(document)
    # Walk three levels up from the region's node and compare the markup.
    great_grandparent = found[0].node.parent.parent.parent
    self.assertEqual(
        str(great_grandparent),
        '<html><head></head><body><div><p>hello</p></div></body></html>')
def test_extract_selector(self):
    '''The selector string inside `$('p')` is expected as a single region.'''
    document = HtmlDocument("<code>$('p').text('hello');</code>")
    found = self.extractor.extract(document)
    self.assertEqual(len(found), 1)
    selector_region = found[0]
    self.assertEqual(selector_region.node, document)
    # The one-character selector "p" sits at offset 3, after "$('".
    self.assertEqual(selector_region.start_offset, 3)
    self.assertEqual(selector_region.end_offset, 3)
    self.assertEqual(selector_region.string, 'p')
def test_count_newlines_as_chars(self):
    '''The newline after <code> takes offset 0, shifting the text line.'''
    node = HtmlDocument('\n'.join([
        '<code>',
        # Four leading spaces: the asserted offsets 1..14 describe a
        # 14-character line, matching the other line-extraction tests.
        '    First line',
        '</code>',
    ])).code
    regions = self.extractor.extract(node)
    r = regions[1]
    self.assertEqual(r.start_offset, 1)
    self.assertEqual(r.end_offset, 14)
def test_extract_pattern_from_option(self):
    '''The argument of grep's -e option is the expected region.'''
    html_lines = [
        '<code>',
        "grep -e pattern *",
        '</code>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    first = found[0]
    # Counting the leading newline as offset 0, "pattern" spans 9..15.
    self.assertEqual(first.start_offset, 9)
    self.assertEqual(first.end_offset, 15)
def test_extract_first_line(self):
    '''A single line is one region, leading whitespace included.'''
    # Four leading spaces: offsets 0..13 describe a 14-character string, so
    # the fixture and the expected string must both be 14 characters long.
    node = HtmlDocument('\n'.join([
        '<code>    First line</code>',
    ])).code
    regions = self.extractor.extract(node)
    r = regions[0]
    self.assertEqual(r.node, node)
    self.assertEqual(r.start_offset, 0)
    self.assertEqual(r.end_offset, 13)
    self.assertEqual(r.string, "    First line")
def test_allow_whitespace_before_directive(self):
    '''A directive preceded by whitespace is still expected to be found.'''
    html_lines = [
        "<code>",
        " RewriteCond %{HTTP_USER_AGENT} ^Mozilla",
        "</code>",
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    first = found[0]
    # NOTE(review): counting characters in the fixture as written puts
    # "^Mozilla" at 33..40 rather than 38..45 -- confirm whether fixture
    # whitespace was lost or the extractor counts offsets differently.
    self.assertEqual(first.start_offset, 38)
    self.assertEqual(first.end_offset, 45)
def test_case_insensitive_directive_detected(self):
    '''A directive written in mixed case is still expected to be found.'''
    html_lines = [
        "<code>",
        "REWRITEcOnD %{HTTP_USER_AGENT} ^Mozilla",
        "</code>",
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    first = found[0]
    # NOTE(review): counting characters in the fixture as written puts
    # "^Mozilla" at 32..39 rather than 34..41 -- confirm whether fixture
    # whitespace was lost or the extractor counts offsets differently.
    self.assertEqual(first.start_offset, 34)
    self.assertEqual(first.end_offset, 41)
def test_skip_blank_regions_but_keep_offset(self):
    '''The wget region keeps its offset behind the preceding shebang line.'''
    wget_extractor = WgetExtractor()
    html_lines = [
        '<code>#!/bin/sh<br>',
        'wget http://google.com',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = wget_extractor.extract(document)
    first = found[0]
    # The first region returned is the wget command at offsets 10..31.
    self.assertEqual(first.start_offset, 10)
    self.assertEqual(first.end_offset, 31)
def test_extract_pattern_containing_spaces(self):
    '''A quoted grep pattern with spaces is expected as one region.'''
    html_lines = [
        '<code>',
        "grep 'Pattern with spaces' *",
        '</code>',
    ]
    document = HtmlDocument('\n'.join(html_lines))
    found = self.extractor.extract(document)
    first = found[0]
    # Offsets 7..25 cover the text between the quotes, not the quotes.
    self.assertEqual(first.start_offset, 7)
    self.assertEqual(first.end_offset, 25)
def test_extract_command(self):
    '''A bare wget command is expected as one region spanning the text.'''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument('<code>wget http://google.com</code>')
    found = wget_extractor.extract(document)
    self.assertEqual(len(found), 1)
    only = found[0]
    self.assertEqual(only.node, document)
    # Offsets 0..21 span all 22 characters of the command.
    self.assertEqual(only.start_offset, 0)
    self.assertEqual(only.end_offset, 21)
    self.assertEqual(only.string, 'wget http://google.com')
def test_extract_multiple_commands(self):
    '''Two wget invocations on separate lines are expected as two regions.'''
    # Raw string: "\." is not a recognised str escape, so the non-raw form
    # triggers an invalid-escape warning on current Python versions.
    extractor = CommandExtractor(r'wget(\.exe)?')
    node = HtmlDocument('\n'.join([
        '<code>',
        ' wget http://google.com',
        ' wget http://gaggle.com',
        '</code>',
    ]))
    regions = extractor.extract(node)
    self.assertEqual(len(regions), 2)
def test_extract_second_line(self):
    '''The second line's region starts right after the first newline.'''
    # Four leading spaces on each line: the first line then occupies 0..13,
    # the newline 14, and the 15-character second line 15..29, which is what
    # the assertions below describe.
    node = HtmlDocument('\n'.join([
        '<code>    First line',
        '    Second line</code>',
    ])).code
    regions = self.extractor.extract(node)
    r2 = regions[1]
    self.assertEqual(r2.node, node)
    self.assertEqual(r2.start_offset, 15)
    self.assertEqual(r2.end_offset, 29)
    self.assertEqual(r2.string, "    Second line")
def test_detect_in_bash_script_with_leading_comments(self):
    '''A wget line following a shell comment line is expected as a region.'''
    extractor = WgetExtractor()
    regions = extractor.extract(
        HtmlDocument('\n'.join([
            # Opening <code> added so the closing tag below has a match;
            # it adds no text, so the expected offsets are unchanged.
            "<code># comment",
            "wget http://gaggle.com</code>",
        ])))
    self.assertEqual(len(regions), 1)
    r = regions[0]
    # "# comment" plus the newline occupy offsets 0..9.
    self.assertEqual(r.start_offset, 10)
    self.assertEqual(r.end_offset, 31)
def test_get_path_multiple_levels_nth_child(self):
    '''Each level of the selector carries its own nth-of-type index.'''
    doc = '\n'.join([
        '<html>',
        ' <body>',
        ' <div>',
        ' <p></p>',
        ' </div>',
        # The comma after this item matters: without it Python silently
        # concatenates the string with the next one into a single item.
        ' <div>',
        ' <p></p>',
        ' <p></p>',
        ' <p></p>',
        ' </div>',
        ' <div></div>',
        ' </body>',
        '</html>',
    ])
    soup = HtmlDocument(doc)
    # Third <p> inside the second <div>.
    p = soup.find_all('div')[1].find_all('p')[2]
    selector = get_css_selector(p)
    self.assertEqual(
        selector,
        'HTML > BODY:nth-of-type(1) > ' +
        'DIV:nth-of-type(2) > P:nth-of-type(3)')