Exemplo n.º 1
0
def first_run():
    """
    Perform the first-run setup: fetch the page and collect its text chunks.

    Downloads ``SETTINGS["url_check"]`` once, then for every configured
    regexp and CSS selector builds the matching parser, runs it against the
    response body and stores the result in ``TEXT_CHUNKS`` keyed by the
    selector string. Regexp selectors are processed before CSS selectors, so
    a CSS selector with the same selector string overwrites the regexp entry.
    """

    response = requests.get(SETTINGS["url_check"])

    # Both selector kinds are handled identically apart from the parser
    # class, so drive them from one table instead of two copied loops.
    parser_classes = (
        ("regexp_selectors", RegexParser),
        ("css_selectors", CSSParser),
    )
    for settings_key, parser_class in parser_classes:
        for selector in SETTINGS[settings_key]:
            parser = parser_class(selector["selector"])
            chunk = parser.search(response.text)
            TEXT_CHUNKS[selector["selector"]] = {
                "name": selector["name"],
                "parser": parser,
                "text_chunk": chunk,
            }

    # Parenthesised so the statement is valid on Python 3 as well as Python 2.
    print("first run complete, now wait")
Exemplo n.º 2
0
 async def collect(self):
     """Download the 89ip.cn proxy listing and return the parsed result.

     The page is parsed by a ``RegexParser`` built with no arguments, i.e.
     with its default rule, which targets entries such as ``8.8.8.8:8080``.
     """
     page_source = await http_client.get_text(
         "http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp="
     )
     default_parser = RegexParser()
     return default_parser.parse(page_source)
Exemplo n.º 3
0
 def test_execute_dict_with_parse_field(self):
     # The parser is built with parse_field="parse_this" and handed a dict
     # whose "parse_this" entry holds the IRC log line; the result must carry
     # the extracted fields and keep the unrelated "other" entry.
     self.parser = RegexParser(
         regs=self.regs,
         use=["irclog", "team", "word"],
         parse_field="parse_this",
     )
     record = {
         "parse_this": "team1|2012-02-27 < jbruce> testing123",
         "other": "should be retained",
     }
     result = self.parser.execute(record)

     expected_fields = (
         ("type", "irclog"),
         ("team", "team1"),
         ("user", "jbruce"),
         ("date", "2012-02-27"),
         ("msg", "testing123"),
         ("other", "should be retained"),
     )
     for field, expected in expected_fields:
         self.assertEqual(result[field], expected)
Exemplo n.º 4
0
 def setUp(self):
     """Build the shared pattern table and a default parser for each test."""
     # Raw strings keep the regex backslashes literal; the non-raw originals
     # relied on invalid escape sequences, which newer Pythons warn about.
     self.regs = {
         "yyyy-mm-dd": r"(?P<date>\d{4}-\d{2}-\d{2})",
         # Referenced as {{user}} by "irclog", and the tests read a "user"
         # group from the match, so this must be a named-group pattern like
         # its siblings (the previous "******" placeholder is not a usable
         # regex).
         "user": r"(?P<user>\w+)",
         "team": r"(?P<team>\w+)",
         "word": r"(?P<word>\w+)",
         "msg": r"(?P<msg>.*)",
         "irclog": r"{{team}}\|{{yyyy-mm-dd}} \< {{user}}\> {{msg}}",
     }
     self.parser = RegexParser(regs=self.regs, use=["irclog", "team", "word"])
     # Type of a compiled pattern object, obtained portably across versions.
     self.pattern_type = type(re.compile(r"\d{2}"))
Exemplo n.º 5
0
# Command-line arguments are given as key=value pairs. Each recognised key
# is popped with a default; anything else stays in ``kwargs``.
kwargs = dict(x.split('=', 1) for x in sys.argv[1:])
bulk = kwargs.pop("bulk", None)
urllink = kwargs.pop(
    "url",
    "http://helsinginsuunnistajat.fi/assets/Kivikko12.10.2014_rastit.txt")
# Only the exact string "True" enables verbose mode.
verbose = kwargs.pop("verbose", "False") == "True"
title = kwargs.pop("title", "")
place = kwargs.pop("place", "")
date = kwargs.pop("date", "1.1.1982")
parser_ = kwargs.pop("parser", "recognize")
if parser_ == "regex":
    # No trailing comma here: bind HTTPParser itself, not a 1-tuple.
    urlParser = HTTPParser
    parser = RegexParser(url=urllink,
                         verbose=verbose,
                         date=date,
                         place=place,
                         header=urllink,
                         title=title)
elif parser_ == "regex2":
    # No trailing comma here: bind HTTPParser itself, not a 1-tuple.
    urlParser = HTTPParser
    parser = RegexParser2(url=urllink,
                          verbose=verbose,
                          date=date,
                          place=place,
                          header=urllink,
                          title=title)
else:
    print(urllink)
    urlParser, parser = recognizeParser(urllink,
                                        bulk=bulk,
                                        verbose=verbose,
Exemplo n.º 6
0
 async def collect(self):
     """Download the page at the URL below and return the parsed result.

     The HTML is handed to a ``RegexParser`` constructed with no arguments.
     """
     page_source = await http_client.get_text(
         url="https://getfreeproxylists.blogspot.com/"
     )
     default_parser = RegexParser()
     return default_parser.parse(page_source)
Exemplo n.º 7
0
    state.parse_state(state_file)
    output.parse_outputs(output_file)
    args = parser.parse_args()
    observer = LogObserver(state_file)

    for fl in config.get_files():
        pos = state.pos(fl)
        inode, dev = state.id(fl)
        filters = config.get_filter(fl)
        name = config.get_name(fl)
        retention = config.get_retention(fl)
        out = output.get_output(config.get_output(fl))

        res = []
        for x in filters:
            res.append(RegexParser(x['regex'], x['emit'], x['transform']))

        observer.add(fl, pos, res, inode, dev, out, name, retention)

    observer.start()
    try:
        while True:
            observer.dump_state()
            observer.flush_output()
            time.sleep(STATE_DUMP_TIMEOUT)
    finally:
        logging.debug('finale')
        # print(event_handler1.dump_state())
        observer.stop()
        observer.join()
        observer.dump_state()
Exemplo n.º 8
0
class RegexParserTestCase(TestCase):
    """Tests for ``RegexParser`` construction and ``execute`` matching."""

    def setUp(self):
        # Raw strings keep the regex backslashes literal; the non-raw
        # originals relied on invalid escape sequences, which newer Pythons
        # warn about.
        self.regs = {
            "yyyy-mm-dd": r"(?P<date>\d{4}-\d{2}-\d{2})",
            # Referenced as {{user}} by "irclog", and the tests read a "user"
            # group from the match, so this must be a named-group pattern
            # like its siblings (the previous "******" placeholder is not a
            # usable regex).
            "user": r"(?P<user>\w+)",
            "team": r"(?P<team>\w+)",
            "word": r"(?P<word>\w+)",
            "msg": r"(?P<msg>.*)",
            "irclog": r"{{team}}\|{{yyyy-mm-dd}} \< {{user}}\> {{msg}}",
        }
        self.parser = RegexParser(regs=self.regs, use=["irclog", "team", "word"])
        # Type of a compiled pattern object, obtained portably across versions.
        self.pattern_type = type(re.compile(r"\d{2}"))

    def _assert_irclog_fields(self, match):
        # Shared checks for the sample line
        # "team1|2012-02-27 < jbruce> testing123" matched as "irclog".
        self.assertEqual(match["type"], "irclog")
        self.assertEqual(match["team"], "team1")
        self.assertEqual(match["user"], "jbruce")
        self.assertEqual(match["date"], "2012-02-27")
        self.assertEqual(match["msg"], "testing123")

    def test_compile(self):
        # Every key listed in ``use`` must have a compiled pattern.
        for key in self.parser.use:
            self.assertIsInstance(self.parser.compiled[key], self.pattern_type)

    def test_bad_use_key(self):
        # A ``use`` key that is absent from ``regs`` must be rejected.
        self.assertRaises(Exception, RegexParser, regs=self.regs, use=["notakey"])

    def test_use_not_empty(self):
        # An empty ``use`` list must be rejected.
        self.assertRaises(Exception, RegexParser, self.regs, use=[])

    def test_search(self):
        # A plain string input is matched directly.
        m1 = self.parser.execute("team1|2012-02-27 < jbruce> testing123")
        self._assert_irclog_fields(m1)

    def test_search_cascade(self):
        """
        note this also tests that the first match wins.
        both team and word will match, but team matches first.
        """
        m1 = self.parser.execute("this is a test")
        self.assertEqual(m1["type"], "team")
        self.assertEqual(m1["team"], "this")

    def test_search_no_match(self):
        # Input that matches none of the patterns, including the empty
        # string, yields None.
        m1 = self.parser.execute(".%$#@.*")
        self.assertIsNone(m1)
        m1 = self.parser.execute("")
        self.assertIsNone(m1)

    def test_execute_dict(self):
        # With no parse_field given, the text is taken from the dict's "msg"
        # entry; unrelated keys must survive in the result.
        m1 = self.parser.execute({"msg": "team1|2012-02-27 < jbruce> testing123", "other": "should be retained"})
        self._assert_irclog_fields(m1)
        self.assertEqual(m1["other"], "should be retained")

    def test_execute_dict_with_parse_field(self):
        # parse_field selects which dict entry holds the text to match.
        self.parser = RegexParser(regs=self.regs, use=["irclog", "team", "word"], parse_field="parse_this")
        m1 = self.parser.execute({"parse_this": "team1|2012-02-27 < jbruce> testing123", "other": "should be retained"})
        self._assert_irclog_fields(m1)
        self.assertEqual(m1["other"], "should be retained")

    def test_execute_dict_with_parse_field_not_in_data(self):
        # When the configured parse_field is missing from the dict, there is
        # nothing to match and the result is None.
        self.parser = RegexParser(regs=self.regs, use=["irclog", "team", "word"], parse_field="parse_this")
        m1 = self.parser.execute({"blah": "team1|2012-02-27 < jbruce> testing123", "other": "should be retained"})
        self.assertIsNone(m1)
Exemplo n.º 9
0
 def test_execute_dict_with_parse_field_not_in_data(self):
     # The parser is configured to read "parse_this", but the dict passed in
     # has no such key, so execute() is expected to return None.
     self.parser = RegexParser(
         regs=self.regs,
         use=["irclog", "team", "word"],
         parse_field="parse_this",
     )
     record = {
         "blah": "team1|2012-02-27 < jbruce> testing123",
         "other": "should be retained",
     }
     result = self.parser.execute(record)
     self.assertIsNone(result)