def initialize_db(self):
    """Create the MongoDB client and bind the target collection.

    Reads host/port/database/collection from the 'db' section of the
    project configuration and sets:
      self.db_cli     -- the MongoClient instance
      self.collection -- handle to the configured collection
    """
    # Read the 'db' section once instead of re-parsing the configuration
    # for every key (the original constructed a fresh CoreConfigure and
    # re-read the section four times).
    db_conf = CoreConfigure().get_config_section_map('db')
    self.db_cli = MongoClient(db_conf['host'], int(db_conf['port']))
    # mongodb collection
    self.collection = self.db_cli[db_conf['database']][db_conf['collection']]
def __init__(self):
    """Initialize the spider: gevent monkey-patching, work queue,
    greenlet pool, URL table, watchdog timer and the DB connection."""
    # Make blocking stdlib I/O cooperate with gevent greenlets.
    # NOTE(review): patching inside __init__ affects the whole process;
    # this is normally done once at program start -- confirm no
    # double-patching if several spiders are created.
    monkey.patch_all()
    self.queue = queue.Queue()
    # Read the 'spider' configuration section once for both settings
    # (the original re-parsed it for each value).
    spider_conf = CoreConfigure().get_config_section_map("spider")
    self.pool = pool.Pool(int(spider_conf['concurrency']))
    self.url_table = UrlTable()
    # Watchdog: calls self.stop after the configured timeout.
    self.timer = Timer(int(spider_conf['timeout']), self.stop)
    self._stop = event.Event()
    self.greenlet_finished = event.Event()
    self.root = None  # url_object
    self.initialize_db()
def __init__(self):
    """Set up reader state: configuration, accuracy threshold, log
    extractor and parsing bookkeeping.

    origintreeaccray: when building call trees from the log file, trees
    whose children's summed execution time is below
    (root node total exetime * origintreeaccray) are discarded.
    """
    self.test_data = []
    # Project configuration (default basepath).
    self.config = CoreConfigure()
    self.accuracy = self.config.get_configure('accuracy')['origintreeaccracy']
    # Regex-based extractor used to pull fields out of each log line.
    self.extrator = regularExtrator()
    self.count = 0
    self.finished = False
    # Optional datetime window used to filter parsed requests.
    self.filter_datetime_begin = None
    self.filter_datetime_end = None
def run(self, url=None):
    """Main crawl loop: seed the root URL, then drain the queue into the
    greenlet pool until stopped or the watchdog timer fires.

    url -- optional seed URL; defaults to content.root_url from the
           configuration file.
    """
    begin = time.time()
    if url is None:
        # read from configure file for default value
        url = CoreConfigure().get_config_section_map('content')['root_url']
    self.set_root(url)
    # Arm the watchdog that calls self.stop() after the timeout.
    self.timer.start()
    logger.info("spider begin crawl")
    while not self.stopped() and self.timer.isAlive():
        # Reap dead greenlets so pool.free_count() stays accurate.
        for greenlet in list(self.pool):
            if greenlet.dead:
                self.pool.discard(greenlet)
        try:
            url = self.queue.get_nowait()
        except queue.Empty:
            if self.pool.free_count() != self.pool.size:
                # wait until one greenlet finish to flash queue
                self.greenlet_finished.wait()
                self.greenlet_finished.clear()
                continue
            else:
                # Queue drained and no workers running: crawl complete.
                self.stop()
                # BUGFIX: the original fell through here and started one
                # extra Handler with the stale previous url, re-crawling
                # it after stop() was requested.
                continue
        greenlet = Handler(url, self)
        self.pool.start(greenlet)
    logger.info("total time elapsed %0.2f" % (time.time() - begin))
def __init__(self):
    """Initialize reader bookkeeping and helpers.

    origintreeaccray controls call-tree pruning: a tree built from the
    log file is discarded when its children's execution-time sum is less
    than root-node-total-exetime * origintreeaccray.
    """
    self.test_data = []
    self.config = CoreConfigure()  # default basepath
    # Pruning threshold read from the 'accuracy' configuration section.
    self.accuracy = self.config.get_configure('accuracy')['origintreeaccracy']
    # Extractor that parses individual log lines.
    self.extrator = regularExtrator()
    self.count = 0
    self.finished = False
    # Datetime filter bounds; populated later by execute().
    self.filter_datetime_begin = None
    self.filter_datetime_end = None
def get_packages(self):
    """Collect all 'java.*' package links from the current page and
    enqueue a type-1 UrlObj for each package found."""
    response = requests.get(self.url_object.url)
    html_packages = PyQuery(response.text)
    all_a_element = html_packages("a")
    # Keep only anchors whose text names a java package.
    all_a_package_element = [
        package for package in all_a_element
        if isinstance(package.text, basestring)
        and package.text.startswith("java.")
    ]
    # Hoisted loop-invariant: the original re-parsed the configuration
    # file on every iteration of the loop below.
    package_root = CoreConfigure().get_config_section_map(
        "content")['package_root']
    # add all class href to queue
    for package_element in all_a_package_element:
        absolute_url = os.path.join(package_root,
                                    package_element.attrib['href'])
        url_object = UrlObj(absolute_url,
                            type=1,
                            package_name=package_element.text,
                            class_name=None,
                            method_name=None)
        self.spider.queue.put(url_object)
def get_classes(self):
    """Locate the 'Class Summary' block on a package page and enqueue a
    type-2 UrlObj for every class it lists."""
    response = requests.get(self.url_object.url)
    html_classes = PyQuery(response.text)
    blocks = [PyQuery(b) for b in html_classes('li.blockList')]
    # Find the block whose header span reads "Class Summary"; as in the
    # original scan, the last matching block wins.
    class_block = None
    for b in blocks:
        for span in b('span'):
            if span.text == "Class Summary":
                class_block = b
    if class_block is None:
        logger.info("no class found in %s" % self.url_object.url)
        return
    # NOTE: the original also read content.class_root into an unused
    # local 'base'; links are actually resolved against the page URL via
    # assemble_url, so the dead config lookup has been removed.
    for cls in class_block('td.colFirst a'):
        cls_name = cls.text
        cls_link = self.assemble_url(self.url_object.url, cls.attrib['href'])
        url_object = UrlObj(cls_link,
                            type=2,
                            package_name=self.url_object.package_name,
                            class_name=cls_name,
                            method_name=None)
        self.spider.queue.put(url_object)
class Reader(object):
    """Parses request log files into per-request info dicts with a small
    state machine; ``execute`` is a generator yielding one dict per
    completed request."""

    # Parser states for the log state machine.
    STATE_UNBEGIN = -1
    STATE_BEGIN = 0
    STATE_GENERAL = 1

    def __init__(self):
        # origintreeaccray: build call tree from log file, discard the
        # built tree whose children's execution-time sum is less than
        # root node total exetime * origintreeaccray
        self.test_data = []
        self.config = CoreConfigure()  # default basepath
        self.accuracy = self.config.get_configure(
            'accuracy')['origintreeaccracy']
        # read test data filename
        self.extrator = regularExtrator()
        self.count = 0
        self.finished = False
        self.filter_datetime_begin = None
        self.filter_datetime_end = None

    def execute(self, datetime_begin=None, datetime_end=None, test_data=None):
        """Generator: parse each configured log file and yield one
        req_info dict per completed request.

        datetime_begin / datetime_end -- optional window used by the
            state machine to filter requests.
        test_data -- iterable of file names relative to MEDIA_ROOT
            (release mode); when None, the debug-mode default file list
            from get_initial_data() is used.
        """
        # initialize filter data
        self.filter_datetime_begin = datetime_begin
        self.filter_datetime_end = datetime_end
        # initialize test data
        if test_data is None:
            # for debug mode
            self.test_data = self.get_initial_data()
        else:
            # for release mode
            self.test_data = [os.path.join(settings.MEDIA_ROOT, d)
                              for d in test_data]
        # current state
        self.current_state = Reader.STATE_UNBEGIN
        # req_info: whole request information (serviceId, called methods,
        # task execute time, etc.)
        # method_lst: all methods seen in the current request
        self.req_info = {}
        self.method_lst = []
        # Automatic machine: parse each log file line by line to extract
        # request information.
        for log_file in self.test_data:
            try:
                # BUGFIX: 'with' guarantees the handle is closed even when
                # parsing raises; the original leaked the file on error.
                with open(log_file, 'r') as f:
                    for line in f:
                        # see detail description in regularExtrator file
                        ret_val = self.extrator.extra(line)
                        parse_res = self.state_machine_change(
                            self.current_state, ret_val)
                        if self.finished is True:
                            yield self.req_info
                            self.count = self.count + 1
                            self.clear_up()
            except Exception as e:
                # handle exception file not exist
                # (modernized from the Py2-only 'except Exception, e')
                logger.error(e)
class Reader(object):
    """Log-file reader: drives a small state machine over request logs
    and yields one request-info dict per parsed request."""

    # State-machine states.
    STATE_UNBEGIN = -1
    STATE_BEGIN = 0
    STATE_GENERAL = 1

    def __init__(self):
        # origintreeaccray: build call tree from log file, discard the
        # built tree whose children's execution-time sum is less than
        # root node total exetime * origintreeaccray
        self.test_data = []
        self.config = CoreConfigure()  # default basepath
        self.accuracy = self.config.get_configure(
            'accuracy')['origintreeaccracy']
        # read test data filename
        self.extrator = regularExtrator()
        self.count = 0
        self.finished = False
        self.filter_datetime_begin = None
        self.filter_datetime_end = None

    def execute(self, datetime_begin=None, datetime_end=None, test_data=None):
        """Generator yielding one req_info dict per completed request.

        datetime_begin / datetime_end -- optional filter window applied
            by the state machine.
        test_data -- file names relative to MEDIA_ROOT (release mode);
            None selects the debug-mode defaults via get_initial_data().
        """
        # initialize filter data
        self.filter_datetime_begin = datetime_begin
        self.filter_datetime_end = datetime_end
        # initialize test data
        if test_data is None:
            # for debug mode
            self.test_data = self.get_initial_data()
        else:
            # for release mode
            self.test_data = [
                os.path.join(settings.MEDIA_ROOT, d) for d in test_data
            ]
        # current state
        self.current_state = Reader.STATE_UNBEGIN
        # req_info: whole request information (serviceId, called methods,
        # task execute time, etc.)
        # method_lst: all methods in the current request
        self.req_info = {}
        self.method_lst = []
        # State machine: extract request information from each log file.
        for log_file in self.test_data:
            try:
                # BUGFIX: context manager closes the file even if parsing
                # raises (the original leaked the handle on error).
                with open(log_file, 'r') as f:
                    for line in f:
                        # see detail description in regularExtrator file
                        ret_val = self.extrator.extra(line)
                        parse_res = self.state_machine_change(
                            self.current_state, ret_val)
                        if self.finished is True:
                            yield self.req_info
                            self.count = self.count + 1
                            self.clear_up()
            except Exception as e:
                # handle exception file not exist
                # (modernized from the Py2-only 'except Exception, e')
                logger.error(e)