def first_run():
    """Fetch the checked URL once and record a text chunk per selector.

    Requests ``SETTINGS["url_check"]``, then walks
    ``SETTINGS["regexp_selectors"]`` followed by
    ``SETTINGS["css_selectors"]``. For every selector entry a parser is
    built from ``selector["selector"]``, its ``search`` is run over the
    response text, and the selector name, the parser and the resulting
    chunk are stored in ``TEXT_CHUNKS`` under the selector string.
    """
    response = requests.get(SETTINGS["url_check"])

    # Regex selectors are handled before CSS ones, so a selector string that
    # appears in both lists ends up holding the CSS entry.
    selector_groups = (
        ("regexp_selectors", RegexParser),
        ("css_selectors", CSSParser),
    )
    for settings_key, parser_class in selector_groups:
        for selector in SETTINGS[settings_key]:
            parser = parser_class(selector["selector"])
            chunk = parser.search(response.text)
            TEXT_CHUNKS[selector["selector"]] = {
                "name": selector["name"],
                "parser": parser,
                "text_chunk": chunk,
            }

    # Parenthesised form: same output under Python 2's print statement and
    # valid as a call under Python 3.
    print("first run complete, now wait")
async def collect(self):
    """Download the 89ip.cn bulk listing page and parse proxies out of it.

    Returns whatever ``RegexParser().parse`` produces for the page HTML.
    """
    listing_url = "http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp="

    # Fetch the raw HTML of the listing page.
    page_source = await http_client.get_text(listing_url)

    # A RegexParser built without arguments is used on purpose: its default
    # rule is meant for plain "ip:port" entries such as 8.8.8.8:8080.
    extractor = RegexParser()
    return extractor.parse(page_source)
def test_execute_dict_with_parse_field(self):
    """Parse a dict whose log line lives under a custom ``parse_field``.

    The parser is rebuilt with ``parse_field="parse_this"``; the result is
    expected to carry the extracted fields and to keep the unrelated
    "other" entry.
    """
    self.parser = RegexParser(
        regs=self.regs,
        use=["irclog", "team", "word"],
        parse_field="parse_this",
    )
    record = {
        "parse_this": "team1|2012-02-27 < jbruce> testing123",
        "other": "should be retained",
    }
    result = self.parser.execute(record)

    # Checked in this order so the first mismatch reported stays the same.
    expectations = (
        ("type", "irclog"),
        ("team", "team1"),
        ("user", "jbruce"),
        ("date", "2012-02-27"),
        ("msg", "testing123"),
        ("other", "should be retained"),
    )
    for field, wanted in expectations:
        self.assertEqual(result[field], wanted)
def setUp(self):
    """Create the pattern table, a default parser and the compiled-pattern type.

    Sets ``self.regs`` (named pattern fragments, with ``irclog`` composed
    from the others through ``{{name}}`` placeholders), ``self.parser``
    (a ``RegexParser`` using the ``irclog``, ``team`` and ``word`` entries)
    and ``self.pattern_type`` (the type returned by ``re.compile``).
    """
    # Raw strings keep the regex escapes (\d, \w, \|, \<, \>) literal without
    # relying on Python passing unknown escape sequences through, which
    # triggers invalid-escape warnings on current interpreters.
    self.regs = {
        "yyyy-mm-dd": r"(?P<date>\d{4}-\d{2}-\d{2})",
        # NOTE(review): this value was the placeholder "******", which is not
        # a usable regex fragment for the {{user}} slot below. Restored by
        # analogy with the "team"/"word" entries -- confirm against upstream.
        "user": r"(?P<user>\w+)",
        "team": r"(?P<team>\w+)",
        "word": r"(?P<word>\w+)",
        "msg": r"(?P<msg>.*)",
        "irclog": r"{{team}}\|{{yyyy-mm-dd}} \< {{user}}\> {{msg}}",
    }
    self.parser = RegexParser(regs=self.regs, use=["irclog", "team", "word"])
    # Derive the compiled-pattern type from re.compile itself so the check
    # does not depend on a particular name for that type.
    self.pattern_type = type(re.compile(r"\d{2}"))
kwargs = dict(x.split('=', 1) for x in sys.argv[1:]) bulk = kwargs.pop("bulk", None) urllink = kwargs.pop( "url", "http://helsinginsuunnistajat.fi/assets/Kivikko12.10.2014_rastit.txt") verbose = kwargs.pop("verbose", "False") == "True" title = kwargs.pop("title", "") place = kwargs.pop("place", "") date = kwargs.pop("date", "1.1.1982") parser_ = kwargs.pop("parser", "recognize") if parser_ == "regex": urlParser = HTTPParser, parser = RegexParser(url=urllink, verbose=verbose, date=date, place=place, header=urllink, title=title) elif parser_ == "regex2": urlParser = HTTPParser, parser = RegexParser2(url=urllink, verbose=verbose, date=date, place=place, header=urllink, title=title) else: print(urllink) urlParser, parser = recognizeParser(urllink, bulk=bulk, verbose=verbose,
async def collect(self):
    """Download the getfreeproxylists blog front page and parse it.

    Returns whatever ``RegexParser().parse`` produces for the page HTML.
    """
    page_source = await http_client.get_text(
        url="https://getfreeproxylists.blogspot.com/"
    )
    # A RegexParser built without arguments, i.e. with its default rule.
    extractor = RegexParser()
    return extractor.parse(page_source)
# Load the saved state and the output definitions, then read CLI arguments.
state.parse_state(state_file)
output.parse_outputs(output_file)
args = parser.parse_args()
observer = LogObserver(state_file)

# Register every configured file with its saved position and identity, its
# parsers, its output, its name and its retention setting.
for path in config.get_files():
    offset = state.pos(path)
    inode_no, device = state.id(path)
    filter_specs = config.get_filter(path)
    label = config.get_name(path)
    keep = config.get_retention(path)
    sink = output.get_output(config.get_output(path))
    # One RegexParser per filter entry, in the configured order.
    parsers = [
        RegexParser(spec['regex'], spec['emit'], spec['transform'])
        for spec in filter_specs
    ]
    observer.add(path, offset, parsers, inode_no, device, sink, label, keep)

observer.start()
try:
    # Dump state and flush output every STATE_DUMP_TIMEOUT seconds until an
    # exception (for example KeyboardInterrupt) breaks the loop.
    while True:
        observer.dump_state()
        observer.flush_output()
        time.sleep(STATE_DUMP_TIMEOUT)
finally:
    # Shutdown path: stop the observer, wait for it, then dump state once more.
    logging.debug('finale')
    observer.stop()
    observer.join()
    observer.dump_state()
class RegexParserTestCase(TestCase):
    """Tests for ``RegexParser`` construction and ``execute`` on strings and dicts."""

    def setUp(self):
        """Create the pattern table, a default parser and the compiled-pattern type."""
        # Raw strings keep the regex escapes (\d, \w, \|, \<, \>) literal
        # without relying on Python passing unknown escape sequences through,
        # which triggers invalid-escape warnings on current interpreters.
        self.regs = {
            "yyyy-mm-dd": r"(?P<date>\d{4}-\d{2}-\d{2})",
            # NOTE(review): this value was the placeholder "******", which is
            # not a usable regex fragment, while the tests below assert on
            # m1["user"]. Restored by analogy with the "team"/"word" entries
            # -- confirm against upstream.
            "user": r"(?P<user>\w+)",
            "team": r"(?P<team>\w+)",
            "word": r"(?P<word>\w+)",
            "msg": r"(?P<msg>.*)",
            "irclog": r"{{team}}\|{{yyyy-mm-dd}} \< {{user}}\> {{msg}}",
        }
        self.parser = RegexParser(regs=self.regs, use=["irclog", "team", "word"])
        # Derive the compiled-pattern type from re.compile itself so the
        # check does not depend on a particular name for that type.
        self.pattern_type = type(re.compile(r"\d{2}"))

    def test_compile(self):
        """Every key listed in ``use`` has a compiled pattern in ``compiled``."""
        for key in self.parser.use:
            self.assertIsInstance(self.parser.compiled[key], self.pattern_type)

    def test_bad_use_key(self):
        """Constructing with a ``use`` key that is not in ``regs`` raises."""
        self.assertRaises(Exception, RegexParser, regs=self.regs, use=["notakey"])

    def test_use_not_empty(self):
        """Constructing with an empty ``use`` list raises."""
        self.assertRaises(Exception, RegexParser, self.regs, use=[])

    def test_search(self):
        """A full IRC log line yields type ``irclog`` and all named groups."""
        m1 = self.parser.execute("team1|2012-02-27 < jbruce> testing123")
        self.assertEqual(m1["type"], "irclog")
        self.assertEqual(m1["team"], "team1")
        self.assertEqual(m1["user"], "jbruce")
        self.assertEqual(m1["date"], "2012-02-27")
        self.assertEqual(m1["msg"], "testing123")

    def test_search_cascade(self):
        """Input that is not an IRC log line falls through to a later pattern.

        This also checks that the first match wins: both ``team`` and
        ``word`` would match the input, but ``team`` comes first in ``use``.
        """
        m1 = self.parser.execute("this is a test")
        self.assertEqual(m1["type"], "team")
        self.assertEqual(m1["team"], "this")

    def test_search_no_match(self):
        """``execute`` returns ``None`` for unmatched and for empty input."""
        m1 = self.parser.execute(".%$#@.*")
        self.assertIsNone(m1)
        m1 = self.parser.execute("")
        self.assertIsNone(m1)

    def test_execute_dict(self):
        """A dict input is parsed and its unrelated keys are kept.

        The log line is supplied under "msg" -- presumably the parser's
        default parse field; confirm against ``RegexParser``.
        """
        m1 = self.parser.execute({
            "msg": "team1|2012-02-27 < jbruce> testing123",
            "other": "should be retained",
        })
        self.assertEqual(m1["type"], "irclog")
        self.assertEqual(m1["team"], "team1")
        self.assertEqual(m1["user"], "jbruce")
        self.assertEqual(m1["date"], "2012-02-27")
        self.assertEqual(m1["msg"], "testing123")
        self.assertEqual(m1["other"], "should be retained")

    def test_execute_dict_with_parse_field(self):
        """A dict input is parsed from the key named by ``parse_field``."""
        self.parser = RegexParser(
            regs=self.regs,
            use=["irclog", "team", "word"],
            parse_field="parse_this",
        )
        m1 = self.parser.execute({
            "parse_this": "team1|2012-02-27 < jbruce> testing123",
            "other": "should be retained",
        })
        self.assertEqual(m1["type"], "irclog")
        self.assertEqual(m1["team"], "team1")
        self.assertEqual(m1["user"], "jbruce")
        self.assertEqual(m1["date"], "2012-02-27")
        self.assertEqual(m1["msg"], "testing123")
        self.assertEqual(m1["other"], "should be retained")

    def test_execute_dict_with_parse_field_not_in_data(self):
        """``execute`` returns ``None`` when the dict lacks the ``parse_field`` key."""
        self.parser = RegexParser(
            regs=self.regs,
            use=["irclog", "team", "word"],
            parse_field="parse_this",
        )
        m1 = self.parser.execute({
            "blah": "team1|2012-02-27 < jbruce> testing123",
            "other": "should be retained",
        })
        self.assertIsNone(m1)
def test_execute_dict_with_parse_field_not_in_data(self):
    """Expect ``None`` when the input dict lacks the configured parse field."""
    self.parser = RegexParser(
        regs=self.regs,
        use=["irclog", "team", "word"],
        parse_field="parse_this",
    )
    # The log line sits under "blah", not under the "parse_this" key.
    record = {
        "blah": "team1|2012-02-27 < jbruce> testing123",
        "other": "should be retained",
    }
    outcome = self.parser.execute(record)
    self.assertIsNone(outcome)