示例#1
0
 def test_crawl_native_fakeCrawler(self):
     """Run the fake 'SimpleTest' crawler end-to-end and verify its saved output."""
     param_path = "./test/search_parameters.json"
     crawler = Crawler("SimpleTest", parameters=FileOperations.get_from_JSON_file(param_path))
     self.assertEqual(crawler.name, "SimpleTest")
     crawler.crawl_native()
     # The parameter file must survive the crawl untouched.
     self.assertTrue(os.path.isfile(param_path))
     saved_results = FileOperations.get_from_JSON_file(crawler.output["path"])
     self.assertEqual(len(saved_results), 3)
示例#2
0
 def test_crawl_clientIntegrations(self):
     """Crawl with the 'Integration' crawler config and persist its results."""
     config = FileOperations.get_from_JSON_file("./test/search_integration.json")
     name = "Integration"
     crawler_params = config["crawlers"][name]
     crawler = Crawler(name, parameters=crawler_params)
     results = crawler.crawl_native()
     # The integration fixture is expected to yield at least one match.
     self.assertTrue(len(results) > 0)
     crawler.save_crawler_data(results, crawler_params["output"])
示例#3
0
 def test_crawl_multithread_mmcoreAsync(self):
     """Crawl the 'dotAsync' config with threads=None and save the results."""
     config = FileOperations.get_from_JSON_file("./test/search_async.json")
     name = "dotAsync"
     crawler_params = config["crawlers"][name]
     crawler = Crawler(name, parameters=crawler_params)
     # threads=None exercises the default threading path of crawl_native.
     results = crawler.crawl_native(threads=None)
     self.assertTrue(len(results) > 0)
     crawler.save_crawler_data(results, crawler_params["output"])
示例#4
0
import argparse
from FileOperations import FileOperations as FO
from Crawler import Crawler
import os.path

parser = argparse.ArgumentParser(description='Crawl file and execute regex rules on them')
parser.add_argument('-p', metavar='ParameterFilePath', type=argparse.FileType('r'), required=True,
                    help="path to a parameter json file. Parameter file should contain a 'crawling', 'rules' and 'result' key")
parser.add_argument('-o', metavar='OutputFilePath', type=argparse.FileType('w+'), help='output file. This argument is required if no output is specified in parameter file.\n The file must be either a .csv or .json')
parser.add_argument('-mt', metavar='Thread Numbers', type=int, help='have a multi-threaded crawler (1 thread per file) and precise the number of concurrent thread')
parser.add_argument('-s', metavar='StartDirectory', type=str, help='directory in which the crawling will start. This parameter is necessary if there is no "crawling" dictionary in the parameter file')

args = parser.parse_args()
# NOTE: argparse.Namespace always contains every declared option (value None when
# the flag is absent), so the previous `"x" in args` membership tests were dead
# code — in particular the "an output must exist somewhere" check could never
# fire. Test the option values instead. `-p` itself is required=True, so
# argparse already guarantees args.p is set.
param = FO.get_from_JSON_file(args.p.name)
# A parameter file without rules, or with no output destination anywhere
# (neither -o nor an "output" key), cannot produce a result.
if "rules" not in param or (args.o is None and "output" not in param):
    print("rules error")
    parser.error(parser.format_usage())
# A starting point is required: either a "crawling" dict in the file or -s.
if "crawling" not in param and args.s is None:
    parser.error(parser.format_usage())
elif args.s is not None:
    # -s overrides / supplies the crawling start directory.
    param["crawling"] = {"start": args.s}
if args.o is not None:
    # -o overrides the parameter file's output; infer the type from the extension.
    output_name, output_extension = os.path.splitext(args.o.name)
    param["output"] = {
        "path": args.o.name,
        "type": "csv" if ".csv" in output_extension else "json"
    }
if args.mt is not None:
    Crawler.crawl_multithread(param.get("crawling"), param.get("rules"), param.get("result"), param["output"], args.mt)
else:
    # Previously the script parsed everything and then did nothing unless -mt
    # was given; fall back to a single-threaded crawl so the CLI always runs.
    Crawler.crawl(param.get("crawling"), param.get("rules"), param.get("result"), param["output"])
示例#5
0
 def test_crawl_fake_directCrawl(self):
     """Call the class-level Crawler.crawl directly and check the output file."""
     params = FileOperations.get_from_JSON_file("./test/search_parameters.json")
     Crawler.crawl(params["crawling"], params["rules"], params["result"], params["output"])
     output_path = params["output"]["path"]
     # Crawling must have written the configured output file.
     self.assertTrue(os.path.isfile(output_path))
     written = FileOperations.get_from_JSON_file(output_path)
     self.assertEqual(len(written), 3)
示例#6
0
 def test_crawl_native_minimalParameterFile_multithreaded_native(self):
     """Multithreaded class-level crawl with a minimal parameter file."""
     params = FileOperations.get_from_JSON_file("./test/minimal_parameters.json")
     results = Crawler.crawl_multithread(params["crawling"], params["rules"], params.get("result"))
     # The minimalist fixture is expected to match 'London' as the first city.
     first_city = results['./test/test_inputs/minimalist_data.txt']['matches']['HasName']['city'][0]
     self.assertEqual(first_city, 'London')
示例#7
0
 def test_crawl_native_minimalParameterFile_multithreaded(self):
     """Instance-level crawl_native with an explicit thread count."""
     crawler = Crawler("MyMinimalCrawler", FileOperations.get_from_JSON_file("./test/minimal_parameters.json"))
     results = crawler.crawl_native(threads=10)
     file_matches = results['./test/test_inputs/minimalist_data.txt']['matches']
     self.assertEqual(file_matches['HasName']['city'][0], 'London')
 def test_getDataFromJSON(self):
     """FileOperations must parse the fixture JSON and expose its keys."""
     loaded = FileOperations.get_from_JSON_file("./test/search_parameters.json")
     self.assertIsNotNone(loaded)
     # Third built-in result key in the fixture is expected to be AUTHOR.
     self.assertEqual(loaded["result"]["built-in"][2], "AUTHOR")