def generate(strategy, source, target, time, test, name):
    """Run the microTOSCA generation pipeline for *source*.

    Looks up the requested mining strategy, runs its static and/or dynamic
    miners, applies every configured refinement strategy, and exports the
    resulting model to *target*.

    Args:
        strategy: name of the mining strategy to look up.
        source: input to mine (passed through to the miners).
        target: output location for the exported microTOSCA.
        time: runtime budget forwarded to the dynamic miner.
        test: test specification forwarded to the dynamic miner.
        name: name of the exported model.
    """
    nodes = {}
    # Look up the requested mining strategy
    strategyConfig = Parser.searchMinerStrategy(strategy)
    # Run the static miner when configured
    if 'static' in strategyConfig:
        print('Executing static mining...')
        StaticMinerContext.doStaticMining(
            strategyConfig['static']['class'],
            source,
            strategyConfig['static'].get('args', {}),
            nodes)
    # Run the dynamic miner when configured
    if 'dynamic' in strategyConfig:
        print('Executing dynamic mining...')
        # BUG FIX: the original else-branch assigned args = {'time': ...}
        # and then immediately replaced it with {'test': ...}, losing the
        # 'time' argument. Both keys are now set on one dict.
        args = strategyConfig['dynamic'].get('args', {})
        args['time'] = time
        args['test'] = test
        strategyConfig['dynamic']['args'] = args
        DynamicMinerContext.doDynamicMining(
            strategyConfig['dynamic']['class'], source, args, nodes)
    # Apply every configured refinement strategy
    refinerStrategies = Parser.getRefinerStrategies()
    if refinerStrategies:
        print('Executing Refinement...')
        RefinerContext.doRefinement(refinerStrategies, nodes)
    # Export the microTOSCA model
    print('Exporting microTOSCA...')
    YMLExporter.export(nodes, target, name)
def extract(self, source):
    """Extract an image from *source*.

    If the image is supported an instance of PIL's Image is returned,
    otherwise None.

    Returns:
        tuple: (img, labels) where img is a PIL Image or None and labels
        are the parsed PDS labels.

    Raises:
        ChecksumError: when a checksum is present, fails to verify, and
            self.raisesChecksumError is set.
        ValueError: when SAMPLE_BITS is not a multiple of 8.
    """
    p = Parser()
    f = open_pds(source)
    if self.log: self.log.debug("Parsing '%s'" % (source))
    self.labels = p.parse(f)
    if self.log: self.log.debug("Found %d labels" % (len(self.labels)))
    if self._check_image_is_supported():
        if self.log: self.log.debug("Image in '%s' is supported" % (source))
        dim = self._get_image_dimensions()
        loc = self._get_image_location()
        imageSampleBits = int(self.labels['IMAGE']['SAMPLE_BITS'])
        imageSampleType = self.labels['IMAGE']['SAMPLE_TYPE']
        md5Checksum = self._get_image_checksum()
        if self.log: self.log.debug("Image dimensions should be %s" % (str(dim)))
        if self.log: self.log.debug("Seeking to image data at %d" % (loc))
        f.seek(loc)
        # BUG FIX: readSize was unbound for sample sizes other than 8/16
        # bits (NameError), and a stray py2 `print readSize` debug
        # statement was removed. The size is now derived from the bit
        # depth, which generalizes the original 8/16-bit cases.
        if imageSampleBits % 8 != 0:
            raise ValueError("Unsupported SAMPLE_BITS: %d" % imageSampleBits)
        readSize = dim[0] * dim[1] * (imageSampleBits // 8)
        if self.log: self.log.debug("Seek successful, reading data (%s)" % (readSize))
        rawImageData = f.read(readSize)
        if md5Checksum:
            rawImageChecksum = hashlib.md5(rawImageData).hexdigest()
            # simplified from the redundant `... and True or False`
            checksumVerificationPassed = rawImageChecksum == md5Checksum
            if not checksumVerificationPassed:
                if self.log: self.log.debug("Secure hash verification failed")
                if self.raisesChecksumError:
                    errorMessage = "Verification failed! Expected '%s' but got '%s'." % (md5Checksum, rawImageChecksum)
                    # py3-compatible raise (was py2-only `raise ChecksumError, errorMessage`)
                    raise ChecksumError(errorMessage)
            else:
                if self.log: self.log.debug("Secure hash verification passed")
        if self.log: self.log.debug("Read successful (len: %d), creating Image object" % (len(rawImageData)))
        # The frombuffer defaults may change in a future release; for
        # portability the call spells out:
        # frombuffer(mode, size, data, 'raw', mode, 0, 1).
        if (imageSampleBits == 16) and imageSampleType == ('MSB_INTEGER'):
            img = Image.frombuffer('F', dim, rawImageData, 'raw', 'F;16B', 0, 1)
            img = ImageMath.eval("convert(a/16.0, 'L')", a=img)
        else:
            img = Image.frombuffer('L', dim, rawImageData, 'raw', 'L', 0, 1)
        if self.log:
            self.log.debug("Image result: %s" % (str(img)))
            self.log.debug("Image info: %s" % (str(img.info)))
            self.log.debug("Image mode: %s" % (str(img.mode)))
            self.log.debug("Image size: %s" % (str(img.size)))
    else:
        if self.log: self.log.error("Image is not supported '%s'" % (source))
        img = None
    f.close()
    return img, self.labels
def extract(self, source):
    """Extract an image from *source*.

    If the image is supported an instance of PIL's Image is returned,
    otherwise None.
    """
    label_parser = Parser()
    stream = open_pds(source)
    log = self.log
    if log: log.debug("Parsing '%s'" % (source))
    self.labels = label_parser.parse(stream)
    if log: log.debug("Found %d labels" % (len(self.labels)))
    # Unsupported images short-circuit: no image, labels still returned.
    if not self._check_image_is_supported():
        if log: log.error("Image is not supported '%s'" % (source))
        stream.close()
        return None, self.labels
    if log: log.debug("Image in '%s' is supported" % (source))
    dim = self._get_image_dimensions()
    loc = self._get_image_location()
    if log: log.debug("Image dimensions should be %s" % (str(dim)))
    if log: log.debug("Seeking to image data at %d" % (loc))
    stream.seek(loc)
    if log: log.debug("Seek successful, reading data")
    raw = stream.read(dim[0] * dim[1])
    if log: log.debug("Read successful (len: %d), creating Image object" % (len(raw)))
    # The frombuffer defaults may change in a future release; the raw
    # decoder arguments are spelled out for portability.
    img = Image.frombuffer('L', dim, raw, 'raw', 'L', 0, 1)
    if log: log.debug("Image result: %s" % (str(img)))
    if log: log.debug("Image info: %s" % (str(img.info)))
    if log: log.debug("Image size: %s" % (str(img.size)))
    stream.close()
    return img, self.labels
def __init__(self, domain, display=None):
    """Collect search-engine results for *domain* and build a Parser over them."""
    self.domain = domain
    self.display = display
    # accumulated raw HTML from every search engine
    self.results = ""
    # browser-like UA so the engines serve normal result pages
    self.user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
    self.p = ProgressBar(display=self.display)
    # gather() populates self.results, so it must run before the Parser is built
    self.gather()
    self.parser = Parser(self.results, self.domain)
def invitees():
    """Yield the invite-calculation result for the test customer file."""
    distance_strategy = DistanceStrategy(GLOBAL_CIRCLE_DISTANCE)
    invite_service = InviteService(distance_estimator=distance_strategy)
    customers = Parser().parsing(
        file_path='./test_customer.json', decoder=CustomerDecoder)
    yield invite_service.calculate(customers)
def init(self):
    """Parse the input file, build the grid and its constraint service."""
    file_parser = Parser(self.file_path)
    file_parser.parse()
    factory = GridFactory(file_parser.get_parsed_data())
    factory.create_grid()
    self.grid = factory.get_grid()
    self.constraint_service = ConstraintService(self.grid)
def input_cmd(self):
    """Read one MiniDFS command from stdin and queue it as the next request.

    Returns early (accepting no command) while fewer peers have answered
    than exist in the peer list.
    """
    if self.recv_count < len(self.peer_address_list):
        return
    parser = Parser()
    cmd_str = input("MiniDFS> ")
    # BUG FIX: a leftover debug line (`cmd_str = "put ptb.wrd"`) overwrote
    # the user's input with a hard-coded command; it has been removed.
    parser.judge_cmd(cmd_str)
    self.request_buffer = parser.data
    self.recv_count = 0
def test_resolvePath_srcAttrElements_returnFullPath(self):
    """Absolute src attributes get the configured host prepended."""
    cases = [
        ('<img src="/testing"/>', f'<img src="{config.HOST}/testing"/>'),
        ('<iframe src="/testing"></iframe>',
         f'<iframe src="{config.HOST}/testing"></iframe>'),
    ]
    for markup, expected in cases:
        resolved = Parser.resolve_path(markup, config.HOST)
        print(resolved)
        self.assertEqual(resolved, expected)
def test_resolvePath_hrefAttrElements_returnFullPath(self):
    """Absolute href attributes get the configured host prepended."""
    cases = [
        ('<a href="/testing"></a>', f'<a href="{config.HOST}/testing"></a>'),
        ('<link href="/testing"/>', f'<link href="{config.HOST}/testing"/>'),
    ]
    for markup, expected in cases:
        resolved = Parser.resolve_path(markup, config.HOST)
        print(resolved)
        self.assertEqual(resolved, expected)
def test_resolvePath_attrMissing_returnOrigin(self):
    """Elements without src/href attributes come back unchanged."""
    for markup in ('<img/>', '<a></a>'):
        resolved = Parser.resolve_path(markup, config.HOST)
        print(resolved)
        self.assertEqual(resolved, markup)
def add_primitives(self, bindings):
    """Register each (callable, type-string) binding as a named Primitive.

    *bindings* maps a name to a pair (function, type expression string).
    """
    primitives = {}
    for name, (fn, type_string) in bindings.items():
        parsed_type = Parser.parse_type(Parser.string_to_sexpr(type_string))
        primitives[name] = Primitive(name, fn, parsed_type, self)
    return self.update(primitives)
def test_parse_args(self):
    """parse_args handles function-typed, generic and untyped arguments."""
    sexpr = Parser.string_to_sexpr("([x: (Num -> Str)] [y: (List Num)] z)")
    parsed = Parser.parse_args(sexpr)
    self.assertEqual(len(parsed), 3)
    expected = [
        ("x", FunType([NumType()], StringType())),
        ("y", ListType(NumType())),
        ("z", DynamicType()),
    ]
    for argument, (identifier, arg_type) in zip(parsed, expected):
        self.assertEqual(argument.type, arg_type)
        self.assertEqual(argument.identifier, identifier)
class Gather():
    """Scrapes several search engines for addresses/hosts on a domain.

    NOTE(review): Python 2 code (urllib2, `print e` statement).
    """

    def __init__(self, domain, display=None):
        self.domain = domain
        self.display = display
        # accumulated raw HTML from every engine
        self.results = ""
        # browser-like UA so the engines serve normal result pages
        self.user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        self.p = ProgressBar(display=self.display)
        # gather() fills self.results, so it must run before Parser is built
        self.gather()
        self.parser = Parser(self.results, self.domain)

    def hosts(self):
        # host names found in the scraped pages
        return self.parser.hosts()

    def emails(self):
        # e-mail addresses found in the scraped pages
        return self.parser.emails()

    @staticmethod
    def get_sources():
        return "Currently searching [google, bing, ask, dogpile, yandex, baidu, yahoo, duckduckgo]"

    def search(self, url, offset=1, maxoffset=0, title=""):
        # Page through `url`, substituting [[OFFSET]] with the current
        # result offset, and concatenate every response body.
        current_offset = 0
        data = ""
        self.p.reset(title=title)
        while current_offset <= maxoffset:
            self.p.rotate()
            temp_url = re.sub(r'\[\[OFFSET\]\]', str(current_offset), url)
            try:
                headers = { 'User-Agent' : self.user_agent }
                req = urllib2.Request(temp_url, None, headers)
                data += urllib2.urlopen(req).read()
            except urllib2.URLError as e:
                # engine unreachable/blocked: stop paging, keep what we have
                self.display.error("Could not access [%s]" % (title))
                return data
            except Exception as e:
                # best-effort: report and try the next offset
                print e
            current_offset += offset
        self.p.done()
        return data

    def gather(self, maxoffset=500):
        # One query per engine; paging step and offset parameter differ per engine.
        self.results += self.search(title="Google", url="http://www.google.com/search?num=100&start=[[OFFSET]]&hl=en&meta=&q=%40\"" + self.domain + "\"", offset=100, maxoffset=maxoffset)
        self.results += self.search(title="Bing", url="http://www.bing.com/search?q=%40" + self.domain + "&count=50&first=[[OFFSET]]", offset=50, maxoffset=maxoffset)
        self.results += self.search(title="Ask", url="http://www.ask.com/web?q=%40" + self.domain + "&pu=100&page=[[OFFSET]]", offset=100, maxoffset=maxoffset)
        self.results += self.search(title="Dogpile", url="http://www.dogpile.com/search/web?qsi=[[OFFSET]]&q=\"%40" + self.domain + "\"", offset=10, maxoffset=maxoffset/10)
        self.results += self.search(title="Yandex", url="http://www.yandex.com/search?text=%40" + self.domain + "&numdoc=50&lr=[[OFFSET]]", offset=50, maxoffset=maxoffset)
        self.results += self.search(title="Baidu", url="http://www.baidu.com/s?wd=%40" + self.domain + "&pn=[[OFFSET]]", offset=10, maxoffset=maxoffset/10)
        self.results += self.search(title="Yahoo", url="https://search.yahoo.com/search?p=\"%40" + self.domain + "\"&b=[[OFFSET]]&pz=10", offset=10, maxoffset=maxoffset/10)
        self.results += self.search(title="DuckDuckGo", url="https://duckduckgo.com/lite?q=\"%40" + self.domain + "\"" )
class Gather():
    """Scrapes several search engines for addresses/hosts on a domain.

    Python 3 port of the urllib2-based variant (uses urllib.request).
    """

    def __init__(self, domain, display=None):
        self.domain = domain
        self.display = display
        # accumulated raw HTML from every engine
        self.results = ""
        # browser-like UA so the engines serve normal result pages
        self.user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        self.p = ProgressBar(display=self.display)
        # gather() fills self.results, so it must run before Parser is built
        self.gather()
        self.parser = Parser(self.results, self.domain)

    def hosts(self):
        # host names found in the scraped pages
        return self.parser.hosts()

    def emails(self):
        # e-mail addresses found in the scraped pages
        return self.parser.emails()

    @staticmethod
    def get_sources():
        return "Currently searching [google, bing, ask, dogpile, yandex, baidu, yahoo, duckduckgo]"

    def search(self, url, offset=1, maxoffset=0, title=""):
        # Page through `url`, substituting [[OFFSET]] with the current
        # result offset, and concatenate every response body.
        current_offset = 0
        data = ""
        self.p.reset(title=title)
        while current_offset <= maxoffset:
            self.p.rotate()
            temp_url = re.sub(r'\[\[OFFSET\]\]', str(current_offset), url)
            try:
                headers = { 'User-Agent' : self.user_agent }
                req = urllib.request.Request(str(temp_url), None, headers)
                # NOTE(review): str() on the bytes body yields "b'...'"
                # repr text rather than decoded HTML — confirm this is what
                # the downstream Parser expects (vs .decode()).
                data += str(urllib.request.urlopen(req).read())
            except urllib.error.URLError as e:
                # engine unreachable/blocked: stop paging, keep what we have
                self.display.error("Could not access [%s]" % (title))
                return data
            except Exception as e:
                # best-effort: report and try the next offset
                print(e)
            current_offset += offset
        self.p.done()
        return data

    def gather(self, maxoffset=500):
        # One query per engine; paging step and offset parameter differ per engine.
        self.results += self.search(title="Google", url="http://www.google.com/search?num=100&start=[[OFFSET]]&hl=en&meta=&q=%40\"" + self.domain + "\"", offset=100, maxoffset=maxoffset)
        self.results += self.search(title="Bing", url="http://www.bing.com/search?q=%40" + self.domain + "&count=50&first=[[OFFSET]]", offset=50, maxoffset=maxoffset)
        self.results += self.search(title="Ask", url="http://www.ask.com/web?q=%40" + self.domain + "&pu=100&page=[[OFFSET]]", offset=100, maxoffset=maxoffset)
        self.results += self.search(title="Dogpile", url="http://www.dogpile.com/search/web?qsi=[[OFFSET]]&q=\"%40" + self.domain + "\"", offset=10, maxoffset=maxoffset/10)
        self.results += self.search(title="Yandex", url="http://www.yandex.com/search?text=%40" + self.domain + "&numdoc=50&lr=[[OFFSET]]", offset=50, maxoffset=maxoffset)
        self.results += self.search(title="Baidu", url="http://www.baidu.com/s?wd=%40" + self.domain + "&pn=[[OFFSET]]", offset=10, maxoffset=maxoffset/10)
        self.results += self.search(title="Yahoo", url="https://search.yahoo.com/search?p=\"%40" + self.domain + "\"&b=[[OFFSET]]&pz=10", offset=10, maxoffset=maxoffset/10)
        self.results += self.search(title="DuckDuckGo", url="https://duckduckgo.com/lite?q=\"%40" + self.domain + "\"" )
def main():
    """Parse the route query and print the resulting XML.

    Returns 1 on invalid input or when no route is found.
    """
    start, dest = process_args()
    route_parser = Parser(start, dest)
    try:
        routes = route_parser.get_routes()
    except (InvalidInput, NotFound) as error:
        print(error)
        return 1
    xml_builder = ConstructXML()
    xml_builder.print_xml(routes)
def main():
    """Compute and print the invitees for the configured customer file."""
    estimator = DistanceStrategy(GLOBAL_CIRCLE_DISTANCE)
    service = InviteService(distance_estimator=estimator)
    customers = Parser.parsing(
        file_path=os.path.abspath(FILE_NAME), decoder=CustomerDecoder)
    FormatterOutput.output(service.calculate(customers))
def render(self, tutorial):
    """ Render a single HTML document of tutorial

    Args:
        tutorial (document.Tutorial): tutorial object

    Returns:
        (str): Html document
    """
    if not isinstance(tutorial, Tutorial):
        return
    rendered = Renderer.render(document=tutorial)
    filtered = Parser.filter(rendered)
    return Parser.resolve_path(filtered, config.HOST)
def execute(cls, entrypoint, dest, ext=config.DOCEXTS[0], debug=False):
    """ Factory Method

    Parses *entrypoint*, extracts the tutorial content, renders it to
    HTML and writes the document into *dest*.

    Returns:
        int: 0 on success, 1 on any handled error.
    """
    err = 0
    try:
        ttp = PyTTP()
        print(f'- Parsing the entry point: {entrypoint}')
        tutorial = ttp.parse(entrypoint)
        print(f'- Extracting content from host for {tutorial}')
        urls = Parser.extract_href(tutorial.table_contents)
        # NOTE(review): only the first two urls are extracted — looks like
        # a debugging limit; confirm before removing the slice.
        ttp.extract(tutorial, urls[:2])
        print(f'- Rendering html')
        html = ttp.render(tutorial)
        # typo fix: "Writting" -> "Writing"
        print(f'- Writing ({ext}) document on disk')
        ttp.write(filename=tutorial.name, data=html, dest=dest, ext=ext)
    except HostNameError as e:
        err = 1
        print('error:', e)
    except EntryPointError as e:
        err = 1
        print(f'error:{entrypoint} is not a valid entry point')
    except NotADirectoryError as e:
        err = 1
        print('error:', e)
    except FileTypeError as e:
        err = 1
        print('error:', e)
    finally:
        # NOTE(review): `return` inside finally swallows any unexpected
        # exception raised in the try body — confirm this is intended.
        return err
def start(self):
    """Bootstrap the crawler: wait for the go signal, load seeds, then
    launch the downloader/parser threads and the checker threads."""
    try:
        self.wait_for_start()
        self._istart = True
        """load seed """
        self.load_seeds()  #load seeds from google search
        """show welcome info"""
        self.show_welcome()
        # record crawl start time for runtime statistics
        self._status._sys_start = time()
        """start threads"""
        self._downloader = Downloader( self._config._down_num, self._status)
        self._downloader.start()
        self._parser = Parser(self._config._parser_num, self._status )
        self._parser.start()
        self._downloader_pool_checker.start()
        self._parse_pool_checker.start()
        self._status_update.start()
        """notify mysql, i am started"""
        self.sqlex.write_if_start()
    except (Exception) as e:
        # any startup failure is logged and re-raised to the caller
        Log().debug("start failed")
        raise(e)
    # NOTE(review): returns False even on successful startup — callers
    # appear to treat the return value as unused; confirm.
    return False
def eval(self, code_string):
    """Parse and interpret *code_string*.

    Returns the interpreted value, or a 'ExceptionName: message' string
    when evaluation raises.
    """
    try:
        ast = Parser.parse(code_string)
        return self.interpret(ast)
    except Exception as e:
        name = e.__class__.__name__
        # BUG FIX: e.args may be empty (e.g. `raise ValueError()`), in
        # which case e.args[0] raised IndexError; fall back to str(e).
        message = e.args[0] if e.args else str(e)
        return '{0}: {1}'.format(name, message)
def test_extractHref_anchorTagHrefAttrMissing_returnEmptyList(self):
    """Anchors without href attributes yield no links."""
    markup = '''<html> <a>link1</a> <a>link2</a> </html>'''
    links = Parser.extract_href(markup)
    print(links)
    self.assertEqual(links, [])
def prepare(self): """ preparation/initialization of opts and env: parsing & checks """ # declare nullscan options self.opt = Option(sys.argv) # check argc and argc (usage) self.check.check_argc(len(sys.argv)) self.check.check_argv(sys.argv) # check for missing libraries / deps / python modules self.check.check_deps(self.file.read_file(PYDEPS)) # parse cmdline and config options, update final options dictionary try: self.parser = Parser(self.opt.opts) self.parser.parse_cmdline() self.parser.parse_config() self.opt.opts = self.parser.opts except: self.log('usage', _type='err', end='\n') # update final options dictionary self.opt.update_opts() # further checks for usage, options, env, etc. self.check.check_opts(self.opt.opts) # collect all py-files and grep the tools out of the py-files tools = [] py_files = self.misc.find_py_files(MOD_PATH) for py in py_files: tools.append(self.misc.grep_tools(py)) tools = [x for sublist in tools for x in sublist] # create the locks for each tool except for excluded ones with ThreadPoolExecutor(50) as exe: for tool in tools: if tool not in self.opt.opts['tools']['ex_tools']: exe.submit(self.file.create_lock, tool) # copy debug flag to target_opts (for nullscan tools) self.opt.opts['targets_opts']['debug'] = self.opt.opts['debug'] return
def extract(self, source):
    """Extract an image from *source*. If the image is supported an
    instance of PIL's Image is returned, otherwise None.

    NOTE(review): despite the docstring, this variant reads a PDS TABLE
    and returns (csv.DictReader-or-None, labels).
    """
    p = Parser()
    f = open_pds(source)
    pdsdatadir, pdsfile = os.path.split(source)
    if self.log:
        self.log.debug("Parsing '%s'" % (source))
    self.labels = p.parse(f)
    if self.log:
        self.log.debug("Found %d labels" % (len(self.labels)))
    if self._check_table_is_supported():
        if self.log:
            self.log.debug("Table in '%s' is supported" % (source))
        dim = self._get_table_dimensions()
        # Get the location of the table (quotes stripped from label value)
        location = self._get_table_location().strip().replace("\"", "")
        #location = os.path.join(pdsdatadir,location)
        # Get the structure of the table from the pointer
        struct_fname = self._get_table_structure().strip().replace( "\"", "")
        structurefile = getPdsFileName(struct_fname, pdsdatadir)
        sp = ColumnParser()
        # NOTE(review): structure file handle `s` is never closed (leak)
        s = open_pds(structurefile)
        slabels = sp.parse(s)
        columns = []
        for l in slabels:
            columns.append(l['COLUMN']['NAME'].strip().replace("\"", ""))
        if self.log:
            self.log.debug("Found %d columns" % (len(columns)))
        if self.labels['TABLE']['INTERCHANGE_FORMAT'] == 'ASCII':
            locationfile = getPdsFileName(location, pdsdatadir)
            # the open() handle lives as long as the returned DictReader
            tbl = csv.DictReader(open(locationfile), fieldnames=columns, delimiter=' ')
        # NOTE(review): a supported table whose format is not ASCII leaves
        # `tbl` unbound, so the return below raises NameError — confirm.
    else:
        if self.log:
            self.log.error("Table is not supported '%s'" % (source))
        tbl = None
    f.close()
    return tbl, self.labels
def extract(self, source):
    """Extract a PDS table from *source*.

    If the table is supported (and in ASCII interchange format) a
    csv.DictReader over the table data is returned, otherwise None.

    Returns:
        tuple: (tbl, labels) where tbl is a csv.DictReader or None and
        labels are the parsed PDS labels.
    """
    p = Parser()
    f = open_pds(source)
    pdsdatadir, pdsfile = os.path.split(source)
    if self.log:
        self.log.debug("Parsing '%s'" % (source))
    self.labels = p.parse(f)
    if self.log:
        self.log.debug("Found %d labels" % (len(self.labels)))
    # BUG FIX: tbl was unbound on the supported-but-non-ASCII path,
    # making the return statement raise NameError.
    tbl = None
    if self._check_table_is_supported():
        if self.log:
            self.log.debug("Table in '%s' is supported" % (source))
        dim = self._get_table_dimensions()
        # Location of the table data (quotes stripped from the label value)
        location = self._get_table_location().strip().replace('"', "")
        # Structure (column definitions) referenced by the table pointer
        struct_fname = self._get_table_structure().strip().replace('"', "")
        structurefile = getPdsFileName(struct_fname, pdsdatadir)
        sp = ColumnParser()
        s = open_pds(structurefile)
        try:
            slabels = sp.parse(s)
        finally:
            # BUG FIX: the structure file handle was never closed
            s.close()
        columns = [l["COLUMN"]["NAME"].strip().replace('"', "") for l in slabels]
        if self.log:
            self.log.debug("Found %d columns" % (len(columns)))
        if self.labels["TABLE"]["INTERCHANGE_FORMAT"] == "ASCII":
            locationfile = getPdsFileName(location, pdsdatadir)
            # NOTE(review): this handle stays open for the lifetime of the
            # returned DictReader; callers own its cleanup.
            tbl = csv.DictReader(open(locationfile), fieldnames=columns, delimiter=" ")
    else:
        if self.log:
            self.log.error("Table is not supported '%s'" % (source))
    f.close()
    return tbl, self.labels
def test_environment(self):
    """Bindings passed to new_environment are visible to evaluated code."""
    runtime = Monito()
    # deliberately multiplies, so (+ 2 3) == 6 proves the override is used
    redefined_plus = Primitive(
        '+',
        lambda x, y: x * y,
        Parser.parse_type(Parser.string_to_sexpr('Num Num -> Num')),
        runtime.environment,
    )
    bindings = {
        'x': 4,
        'hola': Monito.run('(max (list 1 3 2))'),
        '+': redefined_plus,
    }
    new_env = runtime.environment.new_environment(bindings)
    self.assertEqual(Monito.run('(- 3 x)', new_env), -1)
    self.assertEqual(Monito.run('(- 10 hola)', new_env), 7)
    self.assertEqual(Monito.run('(+ 2 3)', new_env), 6)
def run(self):
    """
    Entry point for the program.

    :return: none
    """
    file_cfg = self.config['FILE']
    self.logger.info("Fixed width file generator is starting ...")
    Generator().generate_fixed_width_file(
        file_cfg['SPEC_FILE'],
        int(file_cfg['NO_OF_RECORDS']),
        file_cfg['FIXED_WIDTH_FILE'])
    self.logger.info("Fixed width file parser is starting ...")
    Parser().convert_fixed_width_to_csv(
        file_cfg['SPEC_FILE'],
        file_cfg['FIXED_WIDTH_FILE'],
        file_cfg['CSV_FILE'],
        file_cfg['DELIMITER'])
def parse(self, entrypoint):
    """ Parse the entry point

    Args:
        entrypoint (str): url of any readable tutorial from HOST

    Raises:
        HostNameError: when *entrypoint* is not a valid host name.
    """
    if not is_valid_hostname(entrypoint):
        raise HostNameError(f'{entrypoint} is not a valid host name')

    def fetch_section(section):
        # fetch one section and make its relative paths absolute
        raw = Parser.parse(url=entrypoint, section=section)
        return Parser.resolve_path(raw, config.HOST)

    meta = fetch_section(Section.META)
    table_contents = fetch_section(Section.TABLE_CONTENTS)
    name = self.__parse_tutorial_name(entrypoint)
    return Tutorial(name, meta, table_contents)
def test_parse_type(self):
    """Atomic, arrow and higher-order type expressions all parse."""
    atomic_cases = {
        "Num": NumType(),
        "Str": StringType(),
        "Bool": BoolType(),
        "Void": UnitType(),
        "Dyn": DynamicType(),
    }
    for source, expected in atomic_cases.items():
        self.assertEqual(Parser.parse_type(source), expected)
    self.assertEqual(Parser.parse_type(["Num", "->", "Num"]),
                     FunType([NumType()], NumType()))
    self.assertEqual(Parser.parse_type(["->", "Void"]),
                     FunType([], UnitType()))
    self.assertEqual(
        Parser.parse_type([["Str", "->", "Str"], "->", "Num"]),
        FunType([FunType([StringType()], StringType())], NumType()),
    )
def test_separate_sexpr_strings(self):
    """Top-level s-expressions are split into one string each."""
    code = """ (define x 3) (define f (x) (+ x 4)) (f x) """
    pieces = Parser.separate_sexpr_strings(code)
    self.assertEqual(len(pieces), 3)
    expected = ["(define x 3)", "(define f (x) (+ x 4))", "(f x)"]
    for piece, want in zip(pieces, expected):
        self.assertEqual(piece, want)
def test_string_to_sexpr(self):
    """Atoms, nested forms, strings and curly/bracket syntax all convert."""
    self.assertEqual(Parser.string_to_sexpr("true"), True)
    self.assertEqual(Parser.string_to_sexpr("(and true false)"),
                     ["and", True, False])
    self.assertEqual(
        Parser.string_to_sexpr("(+ (- 3 2) (sum 1 2 3 4))"),
        ["+", ["-", 3, 2], ["sum", 1, 2, 3, 4]],
    )
    self.assertEqual(
        Parser.string_to_sexpr('(f (g (h 2 3 4) "hola") i)'),
        ["f", ["g", ["h", 2, 3, 4], '"hola"'], "i"],
    )
    curly_tree = Parser.string_to_sexpr(""" {local [ (a 3) (b 2) ] (f a b) } """)
    self.assertEqual(curly_tree, ["local", [["a", 3], ["b", 2]], ["f", "a", "b"]])
def run(self):
    """Run the configured tool, then parse its results.

    Returns an "ERROR: ..." string when the configured path is missing
    or invalid, otherwise None.
    """
    # guard clauses: verify that the configured path exists and is a file
    if not self.path:
        return "ERROR: " + self.name + "_path is not configured"
    if not os.path.isfile(self.path):
        return "ERROR: " + self.name + "_path does not point to a valid file"
    # Start process
    process = self.run_command()
    self.results = self.load_results()
    self.parser = Parser(self.results, self.domain)
    self.cleanup()
    return None
def test_parse_arg(self):
    """parse_arg handles typed, list, function and untyped arguments."""
    num_arg1 = Parser.string_to_sexpr("[x : Num]")
    num_arg2 = Parser.string_to_sexpr("[x: Num]")
    self.assertEqual(num_arg1, num_arg2)
    parsed = Parser.parse_arg(num_arg1)
    self.assertEqual(parsed.identifier, "x")
    self.assertEqual(parsed.type, NumType())
    # table-driven variants: (source, identifier, expected type)
    cases = [
        ("[s: Str]", "s", StringType()),
        ("[x: Bool]", "x", BoolType()),
        ("[l: (List Str)]", "l", ListType(StringType())),
        ("d", "d", DynamicType()),
        ("[f: (Str -> Num)]", "f", FunType([StringType()], NumType())),
        ("[g: (Num Str -> Bool)]", "g",
         FunType([NumType(), StringType()], BoolType())),
        ("[h: ((Num -> Num) -> (Str -> Num))]", "h",
         FunType([FunType([NumType()], NumType())],
                 FunType([StringType()], NumType()))),
    ]
    for source, identifier, expected_type in cases:
        argument = self.to_arg(source)
        self.assertEqual(argument.identifier, identifier)
        self.assertEqual(argument.type, expected_type)
class Gather():
    """Search-engine scraper for a domain.

    NOTE(review): Python 2 code (urllib2, `except Exception, e`, `print e`).
    __init__ calls self.gather(), which is not defined in this fragment —
    presumably provided elsewhere (subclass or later definition); confirm.
    """

    def __init__(self, domain, display=None):
        self.domain = domain
        self.display = display
        # accumulated raw HTML from every engine
        self.results = ""
        # browser-like UA so the engines serve normal result pages
        self.user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        self.p = ProgressBar(display=self.display)
        # gather() fills self.results before the Parser is constructed
        self.gather()
        self.parser = Parser(self.results, self.domain)

    def hosts(self):
        # host names found in the scraped pages
        return self.parser.hosts()

    def emails(self):
        # e-mail addresses found in the scraped pages
        return self.parser.emails()

    @staticmethod
    def get_sources():
        return "Currently searching [google, bing, ask, dogpile, yandex, baidu, yahoo, duckduckgo]"

    def search(self, url, offset=1, maxoffset=0, title=""):
        # Page through `url`, substituting [[OFFSET]] with the current
        # result offset, and concatenate every response body.
        current_offset = 0
        data = ""
        self.p.reset(title=title)
        while current_offset <= maxoffset:
            self.p.rotate()
            temp_url = re.sub(r'\[\[OFFSET\]\]', str(current_offset), url)
            try:
                headers = { 'User-Agent' : self.user_agent }
                req = urllib2.Request(temp_url, None, headers)
                data += urllib2.urlopen(req).read()
            except Exception, e:
                # best-effort: report and try the next offset
                print e
            current_offset += offset
        self.p.done()
        return data
def extract(self, tutorial, urls=None, trace=True):
    """ Extracting content section from each given url

    Args:
        tutorial (document.Tutorial): tutorial object
        urls (list): urls (defaults to no urls)
        trace (boolean): print the current url that is being parsed
    """
    if not isinstance(tutorial, Tutorial):
        return
    # BUG FIX: `urls=[]` was a shared mutable default argument; None is
    # now the sentinel and an empty list is created per call.
    if urls is None:
        urls = []
    for url in urls:
        if trace:
            print(f'\t. {url}....................')
        content = Parser.parse(url=url, section=Section.CONTENT)
        tutorial.contents.append(content)
def test_parse_function_with_types(self):
    """Typed lambda arguments are parsed into Argument nodes."""
    fun_node = Parser.parse("(fun ([x: Num] [y: Num]) (+ x y))")
    parsed_args = fun_node.args
    self.assertEqual(len(parsed_args), 2)
    self.assertTrue(isinstance(parsed_args[0], Argument))
    for argument, name in zip(parsed_args, ("x", "y")):
        self.assertEqual(argument.type, NumType())
        self.assertEqual(argument.identifier, name)
    self.assertTrue(isinstance(fun_node.body, App))
def __init__( self):
    """Initialise crawler engine state: config, thread pools, URL filter
    strategies, download folder and database manager."""
    self._istart = False
    self._status = Status()  # runtime counters / statistics
    """--- load config file----"""
    self._config = Configuration();
    """--- core object ----"""
    self._downloader = None
    # NOTE(review): passes _down_num here, while the sibling start()
    # builds Parser(_parser_num, ...) — confirm which count is intended.
    self._parser = Parser( self._config._down_num, self._status )
    """--- memory models --- """
    self._download_pool = SafeQueue() #Store the html objects to be downloaded by the downloader
    self._parse_pool = SafeQueue() #Store the html objects to be parsed by the parser
    """--- checker threads --- """
    """The target is the function passed in to run in the thread. Those two threads keep checking and assigning jobs to the two thread pools"""
    self._downloader_pool_checker = Thread( target=self.download_pool_checker)
    self._parse_pool_checker = Thread( target=self.parse_pool_checker)
    """--- threads --- """
    self._status_update = Thread( target=self.status_update) #every second, this thread post runtime info to remote mysql
    """ ---strategies--- """
    self._earlyvisithandler = EarlyVisitHandler()
    self._robothandler = RobotHandler()
    self._cgihandler = CGIHandler()
    self._nestlevelhandler = NestLevelHandler()
    self._schemehandler = SchemeHandler()
    self._filetypehandler = FileTypeHandler()
    self._bookmarkhandler = BookMarkHandler()
    self._omitindex = OmitIndex()
    self._urlextender = URLExtender()
    """ ---init the path for saving data, if the folder don't exist, create it ---"""
    # per-run folder: <down_path>/<date>/<time>/
    self._path = self._config._down_path+"/"+ strftime('%Y-%m-%d', localtime())+"/"+ strftime('%H-%M-%S', localtime())+"/"
    if not os.path.exists(self._path):
        os.makedirs(self._path)
    self._config._down_path = self._path
    self._keywords_links= []
    """ ---Mysql Manager--- """
    # NOTE(review): "DatabseManager" is the project-wide (misspelled) name
    self.sqlex = DatabseManager(self._config)
class TestParser(unittest.TestCase):
    """Parse + validate account numbers against the expected output strings."""

    parser = Parser()
    validator = Validator()

    def _validated_output(self, raw_input):
        """Parse *raw_input* and return the validated account string."""
        number = self.parser.parse_an_account_number(raw_input)
        return self.validator.validate_account(number).account_data

    def test_read_invalid_account_number_ill(self):
        self.assertEqual(
            self._validated_output(data.INPUT123_INVALID), "12345678? ILL")

    def test_read_invalid_digits_ill_all(self):
        self.assertEqual(
            self._validated_output(data.INPUT_WITH_ILLS_ALL), "????????? ILL")

    def test_read_checksumm_valid_one(self):
        self.assertEqual(
            self._validated_output(data.INPUT_VALID_CHECKSUM1), "000000051")

    def test_read_checksumm_valid_two(self):
        self.assertEqual(
            self._validated_output(data.INPUT_VALID_CHECKSUM2), "345882865")

    def test_read_checksumm_invalid(self):
        self.assertEqual(
            self._validated_output(data.INPUT_INVALID_CHECKSUM), "664371495 ERR")
def start(self):
    """ do first needed things """
    # banner, option parsing and sanity checks
    Help.banner()
    self.opts = vars(Parser.parseArgs())
    checker = Check(self.opts)
    checker.checkArgc()
    checker.checkArgs()
    checker.checkInstallType()
    # dispatch to the chosen installer
    if self.opts['type'] == 'text':
        installer = TextInstaller(self.opts['verbose'])
    else:
        installer = CursesInstaller(self.opts['verbose'])
    installer.run()
    return
def repl(cls):
    """Interactive read-eval-print loop.

    Input is buffered across lines until its parentheses balance, then
    evaluated; non-None results are printed.
    """
    print('Welcome to the Monito REPL\n')
    runtime = Monito()
    pending_lines = 0
    buffered = ''
    while runtime.active:
        prompt = '>> ' if pending_lines == 0 else '\t'
        buffered += cls.input(prompt)
        balanced, fail_index = Parser.balanced_parens(buffered)
        if not balanced and fail_index == len(buffered):
            # parens still open at end of input: keep reading lines
            pending_lines += 1
            continue
        result = runtime.eval(buffered)
        if result is not None:
            print(result)
        pending_lines = 0
        buffered = ''
def test_extractHref_anchorTag_returnList(self):
    """Anchors with href attributes produce a non-None result."""
    markup = '<html><a href="/link1"></a><a href="/link 2"></a></html>'
    links = Parser.extract_href(markup)
    print(links)
    self.assertIsNotNone(links)
def test_extractHref_anchorTagMissing_returnEmptyList(self):
    """A document without anchors yields an empty list.

    BUG FIX: the fixture `html` was defined but the call passed '' —
    the anchor-less document was never actually exercised.
    """
    html = '<html></html>'
    res = Parser.extract_href(html)
    print(res)
    self.assertEqual(res, [])
def start():
    """Bootstrap the application."""
    # configuration is initialised before the parser (order preserved)
    Config.start()
    Parser.start()
def test_ast_generation(self):
    """Literals parse into the expected AST node types."""
    cases = (("false", Boolean), ("2", Number), ("0.2", Number))
    for source, node_type in cases:
        self.assertTrue(isinstance(Parser.parse(source), node_type))
def test_parseTableContents_validEntryPoint_returnStr(self):
    """Parsing the table-of-contents section yields a string."""
    result = Parser.parse(url=config.ENTRYPOINT, section=Section.TABLE_CONTENTS)
    print('')
    self.assertIsInstance(result, str)
def test_resolvePath_emptyArg_returnOrigin(self):
    """Empty input comes back unchanged."""
    markup = ''
    resolved = Parser.resolve_path(markup, '')
    print(resolved)
    self.assertEqual(resolved, markup)
class Engine(object):
    """Web-crawler engine.

    Wires together a downloader and a parser thread pool, two checker
    threads that shuttle work between them, a set of URL-filtering
    strategy handlers, and a MySQL-backed status reporter.

    NOTE(review): this class is Python 2 code (bare ``print`` statements
    below) while other parts of the file use ``print(...)``.
    """

    def __init__( self):
        # Run flag: checker/status threads loop while this is True.
        self._istart = False
        self._status = Status()
        """--- load config file----"""
        self._config = Configuration();
        """--- core object ----"""
        # Created lazily in start(); None until then.
        self._downloader = None
        self._parser = None
        """--- memory models --- """
        self._download_pool = SafeQueue() #Store the html objects to be downloaded by the downloader
        self._parse_pool = SafeQueue() #Store the html objects to be parsed by the parser
        """--- checker threads --- """
        """The target is the function passed in to run in the thread. Those two threads keep checking and assigning jobs to the two thread pools"""
        self._downloader_pool_checker = Thread( target=self.download_pool_checker)
        self._parse_pool_checker = Thread( target=self.parse_pool_checker)
        """--- threads --- """
        self._status_update = Thread( target=self.status_update) #every second, this thread post runtime info to remote mysql
        """ ---strategies--- """
        # One handler per URL-filtering rule; applied in load_seeds()/finish_parse().
        self._earlyvisithandler = EarlyVisitHandler()
        self._robothandler = RobotHandler()
        self._cgihandler = CGIHandler()
        self._nestlevelhandler = NestLevelHandler()
        self._schemehandler = SchemeHandler()
        self._filetypehandler = FileTypeHandler()
        self._bookmarkhandler = BookMarkHandler()
        self._omitindex = OmitIndex()
        self._urlextender = URLExtender()
        """ ---init the path for saving data, if the folder don't exist, create it ---"""
        # Date/time-stamped download directory: <down_path>/YYYY-MM-DD/HH-MM-SS/
        self._path = self._config._down_path+"/"+ strftime('%Y-%m-%d', localtime())+"/"+ strftime('%H-%M-%S', localtime())+"/"
        if not os.path.exists(self._path):
            os.makedirs(self._path)
        self._config._down_path = self._path
        self._keywords_links= []
        """ ---Mysql Manager--- """
        self.sqlex = DatabseManager(self._config)
        #self.f= open("data.txt", 'w')

    def load_seeds(self):
        """Fetch seed URLs from a Google search on the configured keywords,
        filter them through the strategy handlers, and queue the survivors
        in the download pool."""
        #load seed info from config file
        #print "load_seeds 1"
        #load seed from
        contacter = SearchGoogle(self._config._keywords, self._config._result_num)
        self._keywords_links = contacter.getURLs()
        #append seeds, which from google search result, into download pool
        #print "load_seeds 2"
        #self._keywords_links.insert(0, "https://twitter.com/")
        #self._keywords_links.insert(0, "https://twitter.com/signup?context=login")
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                #print "@@{0}".format(url)
                html_task = Html(url)
                #print "@@1"
                # Reject URLs whose scheme the crawler does not support.
                if(self._schemehandler.SchemeChecker(html_task)==False):
                    #print("Ingore the wrong scheme, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
                    #print "@@2"
                    self._status._scheme+=1
                    continue
                # Reject in-page bookmark links (#fragment-only).
                if(self._bookmarkhandler.BookMarkChecker(html_task)==True):
                    #print("Ingore bookmark link, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
                    #print "@@3"
                    self._status._bookmark+=1
                    continue
                # Reject CGI-style URLs.
                if(self._cgihandler.FindCGI(html_task)==True):
                    #print("Ingore the link contain cgi, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
                    #print "@@4"
                    self._status._cgi+=1
                    continue
                # Reject URLs nested deeper than the configured level.
                if(self._nestlevelhandler.checknestlevel(html_task,self._config._parser_nlv)==True):
                    self._status._nestlv +=1
                    #print "@@5"
                    #print("Ingore the link nested too much, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
                    continue
                # Reject unsupported file types.
                if(self._filetypehandler.FileTypeChecker(html_task)==False):
                    #print "@@6"
                    self._status._file_type +=1
                    continue
                #print "@@7"
                '''
                if(self._earlyvisithandler.check_visited(html_task) == True):
                    self._status._early_visit +=1
                    #print("Ingore the link visited before, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
                    continue
                '''
                self._omitindex.Omit(html_task)
                """
                print "@@8"
                if(self._robothandler.is_allowed(html_task) == False):
                    print "@@9"
                    self._status._robot +=1
                    #print("Blocked by the Robot.txt, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
                    continue
                print "@@10"
                """
                # Record the URL as seen, then queue it for download.
                self._earlyvisithandler.add_entry(html_task._md5, html_task)
                self._download_pool.append(html_task)
                '''If use the following two
                line of code, then the program won't run, which means checking for revisit works'''
                '''however, the dic should be safe with a lock'''
                #self._visited_dic[html_task._md5] = html_task._url
                #print(len(self._visited_dic))
                #print "@@11"
            else:
                break
            # NOTE(review): the `continue`s above skip this increment, so
            # rejected links do not count toward _result_num — confirm intended.
            i+=1
        #print "load_seeds 3"

    def show_welcome(self):
        """Print startup banner: download folder, config, and seed URLs."""
        print("download folder:"+self._path)
        print "key words:"+self._config._keywords
        print "donload thread num: {0}".format(self._config._down_num)
        print "parse thread num: {0}".format(self._config._parser_num)
        print "Load " +str(self._config._result_num)+" results from google search:"
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                print ("[{0}]".format(i)+url)
            i+=1
        print "\n------------------------------------------------------------------------\n"
        #raw_input("press any key to start crawling, press second key to stop")

    def wait_for_start(self):
        """Block (polling MySQL once per second) until the remote UI signals start."""
        print "ready for start....."
        print "go to http://dengxu.me/crawling/ to input some key words & see the result "
        while( self.sqlex.read_if_start(self._config)!= True):
            sleep(1)
        print "\n------------------------------------------------------------------------\n"
        print "starting crawling engine...."

    def start(self):
        """Start the engine: wait for the remote start signal, load seeds,
        then launch downloader, parser, checker and status threads."""
        try:
            self.wait_for_start()
            self._istart = True
            """load seed """
            self.load_seeds() #load seeds from google search
            """show welcome info"""
            self.show_welcome()
            self._status._sys_start = time()
            """start threads"""
            self._downloader = Downloader( self._config._down_num, self._status)
            self._downloader.start()
            self._parser = Parser(self._config._parser_num, self._status )
            self._parser.start()
            self._downloader_pool_checker.start()
            self._parse_pool_checker.start()
            self._status_update.start()
            """notify mysql, i am started"""
            self.sqlex.write_if_start()
        except (Exception) as e:
            Log().debug("start failed")
            raise(e)
            # NOTE(review): unreachable — raise above exits the handler first.
            return False

    def stop(self):
        """Stop the engine: clear both pools, stop worker pools, join threads."""
        self._istart = False
        """"clear download and parse popl"""
        self._download_pool.clear()
        self._parse_pool.clear()
        """stop downloader and parser threads"""
        self._downloader.stop()
        self._parser.stop()
        """"Those two checker threads will end when the thread who calls them ends"""
        self._downloader_pool_checker.join()
        self._parse_pool_checker.join()
        self._status_update.join()
        print ("Engine is stopping")

    def pause(self):
        # Not implemented.
        pass

    def finish_download(self, html_task):
        """Downloader callback: hand the downloaded page to the parse pool."""
        # NOTE(review): `sentence` and `full_path` are computed but unused —
        # the writes that consumed them are commented out below.
        sentence = "Downloaded:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-code: {4} data-size: {5}byes url: {6}"\
            .format(self._status._download_times,time()-self._status._sys_start,html_task._depth,\
            html_task._parent,html_task._return_code, html_task._data_size, html_task._url )
        #if self._status._download_times <= 500 :
        #    self.f.write(sentence+"\n")
        """caculate the path for saving files"""
        full_path = self._path+"[No.{0}]_".format(self._status._download_times)+".html"
        """save html data to files"""
        #f= open(full_path, 'w')
        #f.write(html_task._data)
        #f.close()
        """After downloading, pass the data(still using the html objects) to the parse pool"""
        self._parse_pool.append(html_task)

    def finish_parse(self, html_task):
        """Parser callback: queue a newly-extracted URL for download unless it
        was already visited or is blocked by robots.txt."""
        '''
        print("parsed:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-status: {4} data-size: {5}byes url:{6}"\
            .format(self._status._download_times,time()-self._status._sys_start,html_task._depth,\
            html_task._parent,html_task._return_code, html_task._data_size, html_task._url))
        '''
        """After parsing, pass the urls to be downloaded to the download pool"""
        if(self._earlyvisithandler.check_visited(html_task) == True):
            #print("Ingore the link visited before, this link is within page {0} , so don't put it in queue".format(html_task._parent), html_task._url)
            self._status._early_visit +=1
            return
        if(self._robothandler.is_allowed(html_task) == False):
            #print("Blocked by the Robot.txt, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
            self._status._robot +=1
            return
        self._earlyvisithandler.add_entry(html_task._md5, html_task)
        self._download_pool.append(html_task)

    def download_pool_checker(self):
        """Checker-thread loop: feed queued tasks to the downloader pool."""
        while (self._istart == True):
            new_download_task = self._download_pool.pop_left()
            """If there is no task remain in the download pool, put the thread into sleep"""
            """else pop the new task, and download it"""
            """for the engine to get the result to put into the parse pool, we need to pass the function finish_download down as a callback"""
            if (new_download_task == None):
                #print("No task remaining in download_pool")
                sleep(0.1)
            else:
                self._downloader.queue_download_task(new_download_task , self.finish_download)

    def parse_pool_checker(self):
        """Checker-thread loop: feed downloaded pages to the parser pool."""
        while (self._istart == True):
            new_parse_task = self._parse_pool.pop_left()
            if (new_parse_task == None):
                #print("sleeping")
                sleep(0.1)
            else:
                self._parser.queue_parse_task(new_parse_task, self.finish_parse)

    #~~~see result at http://dengxu.me/crawling/
    def status_update(self):
        """Status-thread loop: once per second, print runtime counters and
        push them to the remote MySQL database."""
        while (self._istart == True):
            self._status._download_queue = self._downloader.len()
            self._status._parse_queue = self._parser.len()
            # NOTE(review): placeholder {10} (scheme) is filled from
            # _status._scheme_type, but load_seeds increments _status._scheme —
            # verify which attribute Status actually tracks.
            sentence = "[time: {0:0.1f}],queue:{8}, down: {1}, total: {2:0.1f}MB | queue:{9}, parsed: {3},scheme:{10}, cig: {4}, bookmark: {11} type {12} visited: {5}, robot: {6},nestlv: {7} | error: 404: {13} , timeout: {14}"\
                .format( time()-self._status._sys_start,\
                self._status._download_times, float(self._status._download_size)/1024/1024, self._status._parse_times\
                ,self._status._cgi, self._status._early_visit, self._status._robot, self._status._nestlv\
                ,self._downloader.len(), self._parser.len(),self._status._scheme_type, self._status._bookmark, self._status._file_type\
                ,self._status._404,self._status._socket_timeout)
            print sentence
            #if( self._status._download_times > 500):
            #    self.f.write( sentence+"\n")
            """update status tp mysql"""
            self.sqlex.write_status(self._status)
            """update recent download url"""
            self.sqlex.write_recent_download(self._status)
            sleep(1)
def test_resolvePath_emptyAttr_returnOrigin(self):
    """An anchor with an empty href attribute must come back unchanged."""
    markup = '<a href=""></a>'
    resolved = Parser.resolve_path(markup, config.HOST)
    print(resolved)
    self.assertEqual(resolved, markup)
def test_balanced_parens(self):
    """balanced_parens()[0] is True only for properly matched (), [], {}."""
    cases = [
        ("()", True),
        ("(", False),
        (")", False),
        ("(a)", True),
        ("[][]", True),
        ("(a [b] (c {d}))", True),
        ("(ab [c e (e) {a}] [d])", True),
        ("(a))", False),
        ("([][]}", False),
        ("{[[]}", False),
        (")ab(", False),
    ]
    for text, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(Parser.balanced_parens(text)[0])
# Author: Abubakar Nur Khalil # License: MIT # Purpose: Appropriate parsed output from Parser from utils.tokens import TokenType from core.scanner import Scanner from core.parser import Parser from tools.custom_syntax import Scanner as _Virgil from tools.custom_syntax import Parser as _Dante # Remember we always need to generate the KSL first and pass it over KSL = _Dante(_Virgil('').scan()).parse() source = """ var name = "ank"; 77.67 * (8 // 2); """ print('Source code:') print(source) tks = Scanner(source, KSL[0]).scan() pr = Parser(tks, KSL[1]) print("\nFirst Token is variable (VAR):", pr.check(TokenType.VAR)) # True
def to_arg(self, arg_string):
    """Parse *arg_string* into an argument via an intermediate s-expression."""
    return Parser.parse_arg(Parser.string_to_sexpr(arg_string))