def __init__(self,
             amqp_url='amqp://*****:*****@',
             exchange=None,
             queue=None,
             route_key=None,
             software_url="",
             images_url="",
             hub_url="",
             rmi=True):
    """Create the clients and the RabbitMQ consumer used by the scanner.

    Args:
        amqp_url: URL of the RabbitMQ server the consumer connects to.
        exchange: Exchange name passed to the consumer.
        queue: Queue name passed to the consumer.
        route_key: Routing key passed to the consumer.
        software_url: Endpoint of the Software service.
        images_url: Endpoint of the Images service.
        hub_url: Docker Hub endpoint (accepted but not used here).
        rmi: When true, an image is removed after it is scanned.
    """
    self.rmi = rmi  # remove an image after it is scanned

    self.logger = logging.getLogger(__class__.__name__)
    self.logger.info(__class__.__name__ + " logger initialized")

    # Client of the Software service: the service that returns the
    # software to search for in the images.
    self.client_software = ClientSoftware(api_url=software_url)

    # Client of the Docker daemon running on the local host.
    self.client_daemon = docker.DockerClient(
        base_url='unix://var/run/docker.sock')

    # RabbitMQ consumer: receives the names of the images to scan;
    # self.on_message is called for every message received.
    self.consumer = ConsumerRabbit(amqp_url=amqp_url,
                                   exchange=exchange,
                                   queue=queue,
                                   route_key=route_key,
                                   on_msg_callback=self.on_message)

    # Client of the Images service: used to add and update the images.
    self.client_images = ClientImages(images_url=images_url)
def __init__(self,
             images_url="",
             hub_url="",
             amqp_url='amqp://*****:*****@',
             exchange="dofinder",
             queue="images",
             route_key="images.scan",
             path_file_logging=None):
    """Store the RabbitMQ settings, set up logging and create the clients.

    Args:
        images_url: Endpoint of the Images service.
        hub_url: Docker Hub endpoint.
        amqp_url: URL of the RabbitMQ server used for publishing.
        exchange: Exchange the image names are published to.
        queue: Queue name (accepted but not used here).
        route_key: Routing key used when publishing.
        path_file_logging: Optional path of a rotating statistics log; when
            falsy no file logger is created.
    """
    # Settings for publishing the image name into the RabbitMQ queue.
    self.url_amqp = amqp_url
    self._exchange = exchange
    self._route_key = route_key

    # Stream handler logger.
    self.logger = logging.getLogger(__class__.__name__)
    self.logger.info(__class__.__name__ + " logger initialized")

    # File handler logger (only when a path is given).
    self.path_file_logging = path_file_logging
    self.file_logger = None
    if path_file_logging:
        name_file_logger = __class__.__name__ + "-rotated"
        self.file_logger = logging.getLogger(name_file_logger)
        self.file_logger.setLevel(logging.DEBUG)

        interval = 24  # hours between two rotations
        backupCount = 10  # rotated files kept (10 days of backup)
        self.logger.info("LOGGING PATH: " + path_file_logging +
                         " every hour=" + str(interval) +
                         " with backupcount=" + str(backupCount))
        handler = TimedRotatingFileHandler(path_file_logging,
                                           when="h",
                                           interval=interval,
                                           backupCount=backupCount)
        LOG_FORMAT = ('%(asctime)s %(message)s')
        formatter = logging.Formatter(LOG_FORMAT)
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        self.file_logger.addHandler(handler)
        # Header naming the colon-separated columns written by each check.
        self.file_logger.info("hubtot:dftot:dfremoved:dfpending:dfuptodate")

    # Client of the Images service: used to add and update the image
    # description.
    self.client_images = ClientImages(images_url=images_url)

    # Client of Docker Hub.
    self.client_hub = ClientHub(docker_hub_endpoint=hub_url)
def __init__(self,
             exchange="dofinder",
             queue="images",
             route_key="images.scan",
             amqp_url='amqp://*****:*****@',
             images_url="",
             hub_url="",
             path_last_url="/data/crawler/lasturl.txt",
             policy="none",
             min_stars=0,
             min_pulls=0,
             only_automated=False,
             only_official=False):
    """Create the publisher and the clients, and store the crawl filters.

    Args:
        exchange: RabbitMQ exchange the images are published to.
        queue: RabbitMQ queue name.
        route_key: RabbitMQ routing key.
        amqp_url: URL of the RabbitMQ server.
        images_url: Endpoint of the Images service.
        hub_url: Docker Hub endpoint.
        path_last_url: Path handed to the Docker Hub client.
        policy: One of "stars_first", "pulls_first" or "none"; selects
            the ordering stored in ``self.ordering``.
        min_stars: Stored as ``self.min_stars``.
        min_pulls: Stored as ``self.min_pulls``.
        only_automated: Stored as ``self.only_automated``.
        only_official: Stored as ``self.only_official``.

    Raises:
        KeyError: If ``policy`` is not one of the known policies.
    """
    self.logger = logging.getLogger(__class__.__name__)
    self.logger.info(__class__.__name__ + " logger initialized")

    # Publishes the downloaded images into the RabbitMQ server.
    self.publisher = PublisherRabbit(amqp_url,
                                     exchange=exchange,
                                     queue=queue,
                                     route_key=route_key)
    self.logger.info("RabbitMQ : exchange=" + exchange + ", queue=" +
                     queue + " route key=" + route_key)

    # Client of Docker Hub.
    self.client_hub = ClientHub(docker_hub_endpoint=hub_url,
                                path_last_url=path_last_url)

    # Client of the Images service: if an image is NEW it is sent to the
    # queue, otherwise it is discarded.
    self.client_images = ClientImages(images_url=images_url)

    # Maps the crawling policy to the ordering key.
    ordi = {
        "stars_first": "-star_count",
        "pulls_first": "-pull_count",
        "none": None,
    }
    self.ordering = ordi[policy]
    self.policy = policy

    self.min_stars = min_stars
    self.min_pulls = min_pulls
    self.only_automated = only_automated
    self.only_official = only_official

    self.logger.info(
        "Crawler: ordering={}, policy={}, min_stars={}, min_pulls={}, only_official={}, only_automated={}"
        .format(self.ordering, self.policy, min_stars, min_pulls,
                only_official, only_automated))
class Checker:
    """Keep the local image database aligned with Docker Hub.

    Images no longer alive in Docker Hub are deleted from the local
    database; images flagged by the Images service as needing a new scan
    are set to "pending" and re-published to RabbitMQ.
    """

    def __init__(self,
                 images_url="",
                 hub_url="",
                 amqp_url='amqp://*****:*****@',
                 exchange="dofinder",
                 queue="images",
                 route_key="images.scan",
                 path_file_logging=None):
        """Store the RabbitMQ settings, set up logging, create the clients.

        Args:
            images_url: Endpoint of the Images service.
            hub_url: Docker Hub endpoint.
            amqp_url: URL of the RabbitMQ server used for publishing.
            exchange: Exchange the image names are published to.
            queue: Queue name (accepted but not used by this class).
            route_key: Routing key used when publishing.
            path_file_logging: Optional path of a rotating statistics log;
                when falsy no file logger is created.
        """
        # Settings for publishing the image name into the RabbitMQ queue.
        self.url_amqp = amqp_url
        self._exchange = exchange
        self._route_key = route_key

        # Stream handler logger.
        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")

        # File handler logger (only when a path is given).
        self.path_file_logging = path_file_logging
        self.file_logger = None
        if path_file_logging:
            name_file_logger = __class__.__name__ + "-rotated"
            self.file_logger = logging.getLogger(name_file_logger)
            self.file_logger.setLevel(logging.DEBUG)

            interval = 24  # hours between two rotations
            backupCount = 10  # rotated files kept (10 days of backup)
            self.logger.info("LOGGING PATH: " + path_file_logging +
                             " every hour=" + str(interval) +
                             " with backupcount=" + str(backupCount))
            handler = TimedRotatingFileHandler(path_file_logging,
                                               when="h",
                                               interval=interval,
                                               backupCount=backupCount)
            LOG_FORMAT = ('%(asctime)s %(message)s')
            formatter = logging.Formatter(LOG_FORMAT)
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
            self.file_logger.addHandler(handler)
            # Header naming the colon-separated columns of each check.
            self.file_logger.info(
                "hubtot:dftot:dfremoved:dfpending:dfuptodate")

        # Client of the Images service: used to add and update the image
        # description.
        self.client_images = ClientImages(images_url=images_url)

        # Client of Docker Hub.
        self.client_hub = ClientHub(docker_hub_endpoint=hub_url)

    def send_to_rabbitmq(self, msg):
        """Publish ``msg`` to the configured exchange with the routing key.

        A new blocking connection is opened for every message and is
        closed even when publishing fails.

        Args:
            msg: Message body to publish (a string).
        """
        connection = pika.BlockingConnection(
            pika.URLParameters(self.url_amqp))
        try:
            # NOTE(review): the AMQP URL may embed credentials and is
            # written to the log as-is - confirm this is acceptable.
            self.logger.info("connected to " + self.url_amqp)
            channel = connection.channel()
            channel.basic_publish(exchange=self._exchange,
                                  routing_key=self._route_key,
                                  body=msg)
            self.logger.info(msg + " sent to " + self._exchange)
        finally:
            connection.close()

    def check_images(self):
        """Check every image of the local database against Docker Hub.

        For each image returned by the Images service:

        * not alive in Docker Hub: deleted from the local database;
        * status "updated" and due for a new scan: set to "pending" and
          re-published to RabbitMQ;
        * status "updated" otherwise: left untouched;
        * status "pending": left untouched.

        Returns:
            dict: counters with the keys "removed", "pending", "uptodate"
            and "unclassified" (alive images whose status is neither
            "updated" nor "pending").
        """
        removed = 0
        pending = 0
        uptodate = 0

        json_res = self.client_images.get_images()
        tot_dockerfinder_images = json_res['count']
        self.logger.info(
            str(tot_dockerfinder_images) +
            " images present into local database")
        tot_hub_images = self.client_hub.count_all_images()
        self.logger.info(
            str(tot_hub_images) + ": images present into Docker Hub")

        for image in json_res['images']:
            name = image['name']
            repo = image['repo_name']
            tag = image['tag']
            image_id = image['_id']

            if not self.client_hub.is_alive_in_hub(repo, tag=tag):
                # The image is removed from the database when it is no
                # longer present in Docker Hub.
                self.logger.info(
                    "[" + name +
                    "] deleted from local db because it is not alive in DockerHub"
                )
                self.client_images.delete_image(image_id)
                removed += 1
                continue

            status = image['status']
            self.logger.debug("[" + name + "] status:" + status)
            if status == "updated":
                # A missing last_updated is replaced by a date far in the
                # past.
                last_updated = (image['last_updated']
                                or "2000-01-01T00:00:00.000Z")
                if self.client_images.must_scanned(name, last_updated):
                    self.logger.debug("[" + name +
                                      "] must be scanned again.")
                    self.client_images.update_status(image_id, "pending")
                    self.logger.info("[" + name +
                                     "] from UPDATED to PENDING status.")
                    self.send_to_rabbitmq(
                        json.dumps({
                            "name": name,
                            "repo_name": repo,
                            "tag": tag
                        }))
                    self.logger.info("[" + name + "] requeud into queue.")
                    pending += 1
                else:
                    self.logger.info("[" + name +
                                     "] remains UPDATED status.")
                    uptodate += 1
            elif status == "pending":
                self.logger.info("[" + name + "] remains PENDING status")
                pending += 1

        if self.path_file_logging:
            self.file_logger.info(
                str(tot_hub_images) + ":" + str(tot_dockerfinder_images) +
                ":" + str(removed) + ":" + str(pending) + ":" +
                str(uptodate))

        # A mismatch (e.g. an unexpected status value) is reported instead
        # of asserted: an assert is stripped under -O and would otherwise
        # abort the cycle after all the work has been done.
        unclassified = tot_dockerfinder_images - (pending + uptodate +
                                                  removed)
        if unclassified != 0:
            self.logger.warning(
                "{0} images not classified as removed, pending or up-to-date"
                .format(unclassified))

        self.logger.info("Removed=" + str(removed) + "; pending=" +
                         str(pending) + " up-to-date=" + str(uptodate))
        return {
            "removed": removed,
            "pending": pending,
            "uptodate": uptodate,
            "unclassified": unclassified,
        }

    def verify_images(self):
        """Repair the descriptions of all the images in the local database.

        For every image:

        1) software entries whose version is "." or ".go" are dropped;
        2) ``is_private`` and 3) ``is_automated`` are refreshed from the
           Docker Hub repository description when it provides them.

        The image description is then PUT back to the Images service.

        Returns:
            int: number of image descriptions written back.
        """
        json_res = self.client_images.get_images()
        tot_dockerfinder_images = json_res['count']
        self.logger.info(
            str(tot_dockerfinder_images) +
            " images present into local database")

        updated = 0
        for image in json_res['images']:
            name = image['name']
            # Repository is the part before the first ":"; a name without
            # a tag is accepted as well.
            repo = name.split(":")[0]

            json_response = self.client_hub.get_json_repo(repo)
            if json_response:
                if "is_automated" in json_response:
                    image['is_automated'] = json_response['is_automated']
                if "is_private" in json_response:
                    image['is_private'] = json_response['is_private']

            # NOTE(review): the software clean-up is assumed to run for
            # every image, not only when Docker Hub answered - confirm.
            softwares = image['softwares']
            self.logger.info("before: {0}".format(softwares))
            softwares = [
                sw for sw in softwares if sw['ver'] not in ('.', ".go")
            ]
            self.logger.info("after: {0}".format(softwares))
            image['softwares'] = softwares

            # PUT the new description of the image.
            self.client_images.put_image(image)
            updated += 1
            self.logger.info(
                "UPDATED [" + name +
                "]. {0}/{1}".format(updated, tot_dockerfinder_images))
        return updated

    def run(self, interval_next_check):
        """Run ``check_images`` forever.

        Args:
            interval_next_check: Seconds to sleep after a successful check.
                After a failed check the loop waits 5 seconds and retries.
        """
        self.logger.info("Starting the checker module...")
        while True:
            try:
                self.check_images()
                time.sleep(interval_next_check)
            except Exception as e:
                self.logger.error(str(e))
                self.logger.error("Waiting 5s and restarting.")
                time.sleep(5)
class Scanner:
    """Scan Docker images received from RabbitMQ.

    For every image name received the image is pulled, a container is
    started from it to extract the OS distribution and the software
    versions, ``docker inspect`` data is attached and the resulting
    description is sent to the Images service.
    """

    # Unix socket of the Docker daemon running on the local host.
    DOCKER_SOCKET = 'unix://var/run/docker.sock'
    # Number of times a single image is processed before giving up.
    MAX_ATTEMPTS = 3
    # Regex matches that are not real version strings.
    _INVALID_VERSIONS = (".", ".go")

    def __init__(self,
                 amqp_url='amqp://*****:*****@',
                 exchange=None,
                 queue=None,
                 route_key=None,
                 software_url="",
                 images_url="",
                 hub_url="",
                 rmi=True):
        """Create the clients and the RabbitMQ consumer.

        Args:
            amqp_url: URL of the RabbitMQ server the consumer connects to.
            exchange: Exchange name passed to the consumer.
            queue: Queue name passed to the consumer.
            route_key: Routing key passed to the consumer.
            software_url: Endpoint of the Software service.
            images_url: Endpoint of the Images service.
            hub_url: Docker Hub endpoint (accepted but not used).
            rmi: When true, an image is removed after it is scanned.
        """
        self.rmi = rmi  # remove an image after it is scanned

        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")

        # Client of the Software service: the service that returns the
        # software to search for in the images.
        self.client_software = ClientSoftware(api_url=software_url)

        # Client of the Docker daemon running on the local host.
        self.client_daemon = docker.DockerClient(
            base_url=self.DOCKER_SOCKET)

        # RabbitMQ consumer: receives the names of the images to scan;
        # self.on_message is called for every message received.
        self.consumer = ConsumerRabbit(amqp_url=amqp_url,
                                       exchange=exchange,
                                       queue=queue,
                                       route_key=route_key,
                                       on_msg_callback=self.on_message)

        # Client of the Images service: used to add and update the images.
        self.client_images = ClientImages(images_url=images_url)

    def run(self):
        """Start the scanner by running the RabbitMQ consumer.

        A KeyboardInterrupt stops the consumer.
        """
        try:
            self.consumer.run()
        except KeyboardInterrupt:
            self.consumer.stop()

    def on_message(self, json_image):
        """Callback invoked when the RabbitMQ consumer receives a message.

        The image is processed up to ``MAX_ATTEMPTS`` times; every failure
        is logged and followed by a new attempt.

        Args:
            json_image: Dictionary describing the image to scan.

        Returns:
            bool: True if the image was processed, False if every attempt
            failed.
        """
        image = Image(from_dict_image=json_image)
        self.logger.info("Received Image {}".format(image))

        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                self.logger.info("[{}] start scan".format(image.name))
                self.process_repo_name(image)
                return True
            except (docker.errors.APIError, docker.errors.NotFound) as e:
                self.logger.error(
                    str(e) + ": retry number " + str(attempt))
            except Exception as e:
                # Top-level boundary of the callback: any other failure
                # is logged with its exception name and retried.
                excepName = type(e).__name__
                self.logger.error(" {} excpetion: {}".format(
                    excepName, str(e)))

        self.logger.warning("{} - PURGER from the queue after {} "
                            "attempt".format(image.name,
                                             self.MAX_ATTEMPTS))
        return False

    def process_repo_name(self, image):
        """Process a single image.

        A new image is scanned and POSTed to the Images service; a known
        image that must be scanned again is scanned and PUT; otherwise
        nothing is done.

        Args:
            image: The image to process.
        """
        self.logger.debug("[{}] process repo".format(image.name))
        if self.client_images.is_new(image.name):
            self.logger.info("[{}] is new into local database".format(
                image.name))
            self.scan(image)
            self.client_images.post_image(image.to_dict())
            self.logger.info(
                "[{}] - added to images server succesfully".format(
                    image.name))
        # The Images service (not Docker Hub) decides whether the image
        # must be scanned again.
        elif self.client_images.must_scanned(image.name,
                                             image.last_updated):
            self.logger.info(
                "[{}] is present into images server but must be scan again".
                format(image.name))
            self.scan(image)
            self.logger.info("[{}] - PUT to images server".format(
                image.name))
            self.client_images.put_image(image.to_dict())
            self.logger.info(
                "[{}] - updated into images server succesfully".format(
                    image.name))
        else:
            self.logger.info("[{}] - uptodate into images server".format(
                image.name))

    def scan(self, image):
        """Scan an image and fill in its Docker Finder description.

        The image is pulled, its software and distribution are extracted,
        the inspect data is attached and the scan time is recorded. When
        ``self.rmi`` is true the image is removed afterwards.

        A Docker API error during the scan is logged (not re-raised) and
        the image is removed from the local daemon.

        Args:
            image: The image to scan; it is modified in place.

        Returns:
            The same ``image`` object.
        """
        # Function-scope import keeps the block self-contained.
        import datetime

        self.logger.info("[{}] pulling the image ...".format(image.name))
        try:
            self.client_daemon.images.pull(image.name)
            self.logger.debug('[{0}] start scanning'.format(image.name))

            # Search software versions and system commands.
            self.logger.info(
                '[{0}] extracing softwares versions ...'.format(
                    image.name))
            self.info_dofinder(image)

            # Add information from the inspect command.
            self.logger.info('[{0}] adding docker inspect info....'.format(
                image.name))
            self.info_inspect(image)

            self.logger.info('[{0}] finish scanning'.format(image.name))
            image.last_scan = str(datetime.datetime.now())

            # Set the updated time.
            image.set_updated()

            if self.rmi:
                self.client_daemon.images.remove(image.name, force=True)
                self.logger.info('[{0}] removed image'.format(image.name))
        except (docker.errors.APIError, docker.errors.NotFound) as e:
            self.logger.error(str(e))
            # Not guarded on purpose: if the image is missing this raises
            # and the caller's retry logic takes over.
            self.client_daemon.images.remove(image.name, force=True)
        return image

    def info_dofinder(self, image):
        """Extract the OS distribution and software versions of an image.

        A container is created from the image with a long-running
        entrypoint, the commands are executed inside it and the container
        is then stopped and removed.

        Args:
            image: The image to analyse; ``softwares`` and ``distro`` are
                set on it.

        Raises:
            docker.errors.ImageNotFound, docker.errors.APIError: Re-raised
                after logging and a best-effort removal of the container.
        """
        name = image.name
        self.logger.debug('[{}] searching software ... '.format(name))

        # Keep the container alive while the commands are executed.
        entrypoint = "sleep 1000000000"
        self.logger.debug(
            "[{}] creating container with entrypoint ={}".format(
                name, entrypoint))

        container = None  # stays None when the creation itself fails
        try:
            container = self.client_daemon.containers.create(
                image=name, entrypoint=entrypoint)
            container.start()
            image.softwares = self._extract_softwares(container)
            image.distro = self._extract_distribution(container)
            container.stop(timeout=1)
            container.remove(v=True)
        except (docker.errors.ImageNotFound, docker.errors.APIError) as e:
            self.logger.error(str(e))
            if container is not None:
                self._force_remove(container)
            raise

    def _force_remove(self, container):
        """Force-remove ``container``; a failure is logged, not raised."""
        try:
            container.remove(v=True, force=True)
        except docker.errors.APIError as e:
            self.logger.error(str(e))

    @staticmethod
    def _exec_output(container, command):
        """Run ``command`` in ``container`` and return its decoded output.

        Handles both return shapes of ``exec_run``: raw bytes (Docker SDK
        before 3.0) and the ``(exit_code, output)`` tuple (3.0 and later).
        """
        result = container.exec_run(cmd=command)
        raw = getattr(result, "output", result)
        return raw.decode(errors="replace")

    def _extract_distribution(self, container):
        """Return the first OS distribution string found in the container.

        Each ``(command, regex)`` pair of the Software service is tried in
        order.

        Returns:
            str or None: the matched text, or None when nothing matches.
        """
        for command, regex in self.client_software.get_system():
            output = self._exec_output(container, command)
            match = re.search(regex, output)
            if match:
                distro = match.group(0)
                self.logger.info("{0} found.".format(distro))
                return distro
            self.logger.debug("[{0}] NOT found in ".format(command))
        return None

    def _extract_softwares(self, container):
        """Return the software versions found in the container.

        Every software of the Software service is described by a dict such
        as ``{"name": "python", "cmd": "--version", "regex": "..."}``; the
        command ``name + " " + cmd`` is executed and the first regex match
        is the version. Matches equal to "." or ".go" are discarded.

        Returns:
            list: dicts of the form ``{'software': name, 'ver': version}``.
        """
        softwares = []
        for sw in self.client_software.get_software():
            command = sw['name'] + " " + sw['cmd']
            output = self._exec_output(container, command)
            match = re.search(sw['regex'], output)
            if match:
                version = match.group(0)
                if version not in self._INVALID_VERSIONS:
                    softwares.append({
                        'software': sw['name'],
                        'ver': version
                    })
                    self.logger.debug("{0} {1} found.".format(
                        sw['name'], version))
            else:
                self.logger.debug("[{0}] NOT found in ".format(
                    sw['name']))
        self.logger.info("{} software found".format(len(softwares)))
        self.logger.debug('Software found: [' + ''.join(
            '{} {},'.format(s['software'], s['ver'])
            for s in softwares) + "]")
        return softwares

    def info_inspect(self, image):
        """Attach a subset of the ``docker inspect`` output to the image.

        Only the wanted keys that are present in the inspect result are
        copied into ``image.inspect_info``.

        Args:
            image: The image to inspect; it is modified in place.
        """
        self.logger.debug('[{}] $docker inspect <image>'.format(
            image.name))
        # The low-level client is used because it offers inspect_image.
        client = docker.APIClient(base_url=self.DOCKER_SOCKET)
        try:
            json_inspect = client.inspect_image(image.name)
        finally:
            client.close()
        wanted_keys = [
            'Id', 'RepoTags', 'RepoDigests', "Parent", "DockerVersion",
            "Size", "GraphDriver", "RootFS", "VirtualSize", "Architecture",
            "Os"
        ]
        image.inspect_info = {
            k: json_inspect[k]
            for k in wanted_keys if k in json_inspect
        }

    def version_from_regex(self, container_id, command, regex):
        """Run ``command`` in a container and extract a version with regex.

        Args:
            container_id: Identifier of the running container.
            command: Command to execute.
            regex: Regular expression whose first match is the version.

        Returns:
            str or None: the matched version; None when there is no match,
            when the match is "." or ".go", or when Docker reports
            NotFound.
        """
        try:
            output = self.run_command(container_id, command)
            match = re.search(regex, output)
            if not match:
                self.logger.debug("[{0}] NOT found in {1}".format(
                    command, container_id))
                return None
            version = match.group(0)
            if version in self._INVALID_VERSIONS:
                return None
            self.logger.debug("[{0}] found in {1}".format(
                command, container_id))
            return version
        except docker.errors.NotFound:
            self.logger.debug(command + " not found")
            return None

    def run_command(self, container_id, command):
        """Execute ``command`` in a running container, like 'docker exec'.

        Args:
            container_id: Identifier of the running container.
            command: Command to execute.

        Returns:
            str: the decoded output of the command.
        """
        self.logger.debug("[{0}] running command {1}".format(
            container_id, command))
        # exec_create / exec_start belong to the low-level API client.
        api = self.client_daemon.api
        created_exec = api.exec_create(container_id, cmd=command)
        output = api.exec_start(created_exec['Id'])
        return output.decode()