Example #1
 def initialize_db(self):
     # read the 'db' section once instead of re-instantiating CoreConfigure per key
     db_conf = CoreConfigure().get_config_section_map('db')
     host = db_conf['host']
     port = db_conf['port']
     db_name = db_conf['database']
     collect_name = db_conf['collection']
     self.db_cli = MongoClient(host, int(port))
     # mongodb collection handle
     self.collection = self.db_cli[db_name][collect_name]
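CoreConfigure is a project-internal class and its implementation is not shown on this page. A minimal sketch of what get_config_section_map might do, assuming an INI-style config file read with the standard-library ConfigParser (the file path and the whole implementation are assumptions for illustration):

    # illustrative sketch only, not the project's actual CoreConfigure
    try:
        from configparser import ConfigParser   # Python 3
    except ImportError:
        from ConfigParser import ConfigParser   # Python 2

    class CoreConfigureSketch(object):
        def __init__(self, path='conf/core.cfg'):   # path is an assumption
            self._parser = ConfigParser()
            self._parser.read(path)

        def get_config_section_map(self, section):
            # return one section (e.g. 'db') as a plain dict of option -> value
            return dict(self._parser.items(section))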
Example #2
 def __init__(self):
     monkey.patch_all()
     self.queue = queue.Queue()
     self.pool = pool.Pool(
         int(CoreConfigure().get_config_section_map("spider")
             ['concurrency']))
     self.url_table = UrlTable()
     self.timer = Timer(
         int(CoreConfigure().get_config_section_map("spider")['timeout']),
         self.stop)
     self._stop = event.Event()
     self.greenlet_finished = event.Event()
     self.root = None  # url_object
     self.initialize_db()
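The queue, pool, event, and monkey primitives used above match gevent's cooperative-concurrency API (an assumption, since the example's imports are not shown). A minimal sketch of the same setup:

    # assumed to be gevent; values are illustrative
    from gevent import monkey, pool, queue, event

    monkey.patch_all()            # make blocking stdlib calls cooperative
    work_queue = queue.Queue()    # URLs waiting to be crawled
    worker_pool = pool.Pool(8)    # 8 is an illustrative concurrency limit
    finished = event.Event()      # set whenever a worker greenlet completes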
Example #3
 def __init__(self):
     ## origintreeaccracy: build the call tree from the log file; discard any built
     ##                    tree whose children's summed execution time is less than
     ##                    the root node's total execution time * origintreeaccracy
     self.test_data = []
     self.config = CoreConfigure()
     # default accuracy threshold for pruning (see the comment above)
     self.accuracy = self.config.get_configure(
         'accuracy')['origintreeaccracy']
     # extractor used to parse lines from the log files
     self.extrator = regularExtrator()
     self.count = 0
     self.finished = False
     self.filter_datetime_begin = None
     self.filter_datetime_end = None
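The pruning rule described in the comment can be written out directly; a minimal sketch, with illustrative names that are not taken from the project:

    def should_discard(root_total_time, children_times, accuracy):
        # discard a built call tree when its children's summed execution time
        # is below the root node's total execution time times the threshold
        return sum(children_times) < root_total_time * accuracy

For example, with accuracy = 0.9, a tree whose root took 100 ms in total but whose children account for only 80 ms of it would be discarded (80 < 100 * 0.9).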
Example #4
 def run(self, url=None):
     begin = time.time()
     if url is None:
         # read from configure file for default value
         url = CoreConfigure().get_config_section_map('content')['root_url']
     self.set_root(url)
     self.timer.start()
     logger.info("spider begin crawl")
     while not self.stopped() and self.timer.isAlive():
         for greenlet in list(self.pool):
             if greenlet.dead:
                 self.pool.discard(greenlet)
         try:
             url = self.queue.get_nowait()
         except queue.Empty:
             if self.pool.free_count() != self.pool.size:
                 # wait until a greenlet finishes and refills the queue
                 self.greenlet_finished.wait()
                 self.greenlet_finished.clear()
                 continue
             else:
                 # queue is empty and every greenlet is idle: stop the crawl
                 self.stop()
                 continue
         greenlet = Handler(url, self)
         self.pool.start(greenlet)
     logger.info("total time elapsed %0.2f" % (time.time() - begin))
Example #5
 def __init__(self):
     ## origintreeaccracy: build the call tree from the log file; discard any built
     ##                    tree whose children's summed execution time is less than
     ##                    the root node's total execution time * origintreeaccracy
     self.test_data = []
     self.config = CoreConfigure()
     # default accuracy threshold for pruning (see the comment above)
     self.accuracy = self.config.get_configure('accuracy')['origintreeaccracy']
     # extractor used to parse lines from the log files
     self.extrator = regularExtrator()
     self.count = 0
     self.finished = False
     self.filter_datetime_begin = None
     self.filter_datetime_end = None
Example #6
 def get_packages(self):
     response = requests.get(self.url_object.url)
     html_packages = PyQuery(response.text)
     all_a_element = html_packages("a")
     all_a_package_element = [
         package for package in all_a_element
         if isinstance(package.text, basestring)
         and package.text.startswith("java.")
     ]
     # enqueue a UrlObj for every java.* package link found on the page
     for package_element in all_a_package_element:
         absolute_url = os.path.join(
             CoreConfigure().get_config_section_map("content")
             ['package_root'], package_element.attrib['href'])
         url_object = UrlObj(absolute_url,
                             type=1,
                             package_name=package_element.text,
                             class_name=None,
                             method_name=None)
         self.spider.queue.put(url_object)
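Note that the absolute URL above is built with os.path.join, which uses the operating system's path separator and therefore only behaves correctly for URLs on POSIX systems. The standard library's urljoin is the more robust choice for this kind of joining; a short sketch with illustrative values:

    try:
        from urllib.parse import urljoin   # Python 3
    except ImportError:
        from urlparse import urljoin       # Python 2

    # base URL and href are illustrative, not taken from the configuration
    absolute_url = urljoin('https://docs.example.org/api/',
                           'java/util/package-summary.html')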
Example #7
 def get_classes(self):
     response = requests.get(self.url_object.url)
     html_classes = PyQuery(response.text)
     blocks = [PyQuery(b) for b in html_classes('li.blockList')]
     class_block = None
     for b in blocks:
         spans = b('span')
         for span in spans:
             if span.text == "Class Summary":
                 class_block = b
     if class_block is None:
         logger.info("no class found in %s" % self.url_object.url)
         return
     base = CoreConfigure().get_config_section_map('content')['class_root']
     for cls in class_block('td.colFirst a'):
         cls_name = cls.text
         cls_link = cls.attrib['href']
         cls_link = self.assemble_url(self.url_object.url, cls_link)
         url_object = UrlObj(cls_link,
                             type=2,
                             package_name=self.url_object.package_name,
                             class_name=cls_name,
                             method_name=None)
         self.spider.queue.put(url_object)
Example #8
class Reader(object):
    STATE_UNBEGIN = -1
    STATE_BEGIN = 0
    STATE_GENERAL = 1

    def __init__(self):
        ## origintreeaccracy: build the call tree from the log file; discard any built
        ##                    tree whose children's summed execution time is less than
        ##                    the root node's total execution time * origintreeaccracy
        self.test_data = []
        self.config = CoreConfigure()
        # default accuracy threshold for pruning (see the comment above)
        self.accuracy = self.config.get_configure('accuracy')['origintreeaccracy']
        # extractor used to parse lines from the log files
        self.extrator = regularExtrator()
        self.count = 0
        self.finished = False
        self.filter_datetime_begin = None
        self.filter_datetime_end = None

    def execute(self, datetime_begin=None, datetime_end=None, test_data=None):
        """
            method execute is a generator
        """
        # initialize filter data
        self.filter_datetime_begin = datetime_begin
        self.filter_datetime_end = datetime_end
        # initialize test data
        if test_data is None:
            # for debug mode
            self.test_data = self.get_initial_data()
        else:
            # for release mode
            self.test_data = [os.path.join(settings.MEDIA_ROOT, d) for d in test_data]
        # current state
        self.current_state = Reader.STATE_UNBEGIN

        # working variables
        # req_info: all information for one request (serviceId, called methods,
        #           task execution time, etc.)
        # method_lst: all methods seen in the current request

        self.req_info = {}
        self.method_lst = []

        # state machine that parses each log file and extracts request information
        for log_file in self.test_data:
            try:
                f = open(log_file, 'r')
                while True:
                    line = f.readline()
                    if line:
                        # see the detailed description in the regularExtrator module
                        ret_val = self.extrator.extra(line)
                        parse_res = self.state_machine_change(self.current_state, ret_val)
                        if self.finished is True:
                            yield self.req_info
                            self.count = self.count + 1
                            self.clear_up()
                    else:
                        # the whole file has been processed
                        f.close()
                        break
            except Exception as e:
                # handle missing log files and other errors
                logger.error(e)
Example #9
class Reader(object):
    STATE_UNBEGIN = -1
    STATE_BEGIN = 0
    STATE_GENERAL = 1

    def __init__(self):
        ## origintreeaccracy: build the call tree from the log file; discard any built
        ##                    tree whose children's summed execution time is less than
        ##                    the root node's total execution time * origintreeaccracy
        self.test_data = []
        self.config = CoreConfigure()
        # default accuracy threshold for pruning (see the comment above)
        self.accuracy = self.config.get_configure(
            'accuracy')['origintreeaccracy']
        # extractor used to parse lines from the log files
        self.extrator = regularExtrator()
        self.count = 0
        self.finished = False
        self.filter_datetime_begin = None
        self.filter_datetime_end = None

    def execute(self, datetime_begin=None, datetime_end=None, test_data=None):
        """
            method execute is a generator
        """
        # initialize filter data
        self.filter_datetime_begin = datetime_begin
        self.filter_datetime_end = datetime_end
        # initialize test data
        if test_data is None:
            # for debug mode
            self.test_data = self.get_initial_data()
        else:
            # for release mode
            self.test_data = [
                os.path.join(settings.MEDIA_ROOT, d) for d in test_data
            ]
        # current state
        self.current_state = Reader.STATE_UNBEGIN

        # working variables
        # req_info: all information for one request (serviceId, called methods,
        #           task execution time, etc.)
        # method_lst: all methods seen in the current request

        self.req_info = {}
        self.method_lst = []

        # state machine that parses each log file and extracts request information
        for log_file in self.test_data:
            try:
                f = open(log_file, 'r')
                while True:
                    line = f.readline()
                    if line:
                        # see the detailed description in the regularExtrator module
                        ret_val = self.extrator.extra(line)
                        parse_res = self.state_machine_change(
                            self.current_state, ret_val)
                        if self.finished is True:
                            yield self.req_info
                            self.count = self.count + 1
                            self.clear_up()
                    else:
                        # the whole file has been processed
                        f.close()
                        break
            except Exception as e:
                # handle missing log files and other errors
                logger.error(e)
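Since execute is a generator, callers iterate over it and receive one parsed request at a time; a minimal usage sketch (the log file name is illustrative):

    reader = Reader()
    for req_info in reader.execute(test_data=['run1.log']):   # 'run1.log' is illustrative
        print(req_info)   # one request's parsed information per iteration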