class Checker:
    def __init__(self,
                 images_url="http://127.0.0.1:3000/api/images",
                 hub_url="https://hub.docker.com/",
                 amqp_url='amqp://*****:*****@127.0.0.1:5672',
                 exchange="dofinder",
                 queue="images",
                 route_key="images.scan",
                 path_file_logging=None):

        # For publishing the image names into the RabbitMQ queue.
        self.url_amqp = amqp_url
        self._exchange = exchange
        self._route_key = route_key

        # stream handler logger
        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")

        # file handler logger
        self.path_file_logging = path_file_logging
        self.file_logger = None
        if path_file_logging:
            name_file_logger = __class__.__name__ + "-rotated"
            self.file_logger = logging.getLogger(name_file_logger)
            self.file_logger.setLevel(logging.DEBUG)
            interval = 24
            backupCount = 10  # keep 10 days of backups
            self.logger.info("LOGGING PATH: " + path_file_logging +
                             " rotating every " + str(interval) +
                             " hours with backupCount=" + str(backupCount))
            handler = TimedRotatingFileHandler(path_file_logging,
                                               when="h",
                                               interval=interval,
                                               backupCount=backupCount)
            LOG_FORMAT = '%(asctime)s %(message)s'
            formatter = logging.Formatter(LOG_FORMAT)
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
            # add the file handler to the logger
            self.file_logger.addHandler(handler)
            # header of the colon-separated line logged by check_images():
            # tot_hub_images:tot_dockerfinder_images:removed:pending:uptodate
            self.file_logger.info("hubtot:dftot:dfremoved:dfpending:dfuptodate")

        # client of the Images Service: used to add and update the image descriptions.
        self.client_images = ClientImages(images_url=images_url)

        # client of Docker Hub.
        self.client_hub = ClientHub(docker_hub_endpoint=hub_url)

    def send_to_rabbitmq(self, msg):
        connection = pika.BlockingConnection(pika.URLParameters(self.url_amqp))
        self.logger.info("connected to " + self.url_amqp)
        # open the channel
        channel = connection.channel()
        # publish the message to the exchange with the configured routing key
        self.logger.info(self._route_key)
        channel.basic_publish(exchange=self._exchange,
                              routing_key=self._route_key,
                              body=msg)
        self.logger.info(msg + " sent to " + self._exchange)
        connection.close()

    def check_images(self):
        """
        Scan the images stored in the local database and synchronize them with
        Docker Hub: images no longer present on Docker Hub are removed, while
        images that have been updated on Docker Hub are re-queued for scanning.
        """
        checked = {}
        tot_dockerfinder_images = 0
        removed = 0
        pending = 0
        uptodate = 0

        json_res = self.client_images.get_images()
        tot_dockerfinder_images = json_res['count']
        self.logger.info(str(tot_dockerfinder_images) +
                         " images present in the local database")

        tot_hub_images = self.client_hub.count_all_images()
        self.logger.info(str(tot_hub_images) + " images present in Docker Hub")

        images = json_res['images']
        for image in images:
            name = image['name']
            repo = image['repo_name']
            tag = image['tag']
            image_id = image['_id']
            if self.client_hub.is_alive_in_hub(repo, tag=tag):
                self.logger.debug("[" + name + "] status:" + image['status'])
                if image['status'] == "updated":
                    if self.client_images.must_scanned(
                            name,
                            image['last_updated'] if image['last_updated']
                            else "2000-01-01T00:00:00.000Z"):
                        self.logger.debug("[" + name + "] must be scanned again.")
                        self.client_images.update_status(image_id, "pending")  # set status to pending
                        self.logger.info("[" + name + "] from UPDATED to PENDING status.")
                        self.send_to_rabbitmq(json.dumps({
                            "name": name,
                            "repo_name": repo,
                            "tag": tag
                        }))
                        self.logger.info("[" + name + "] requeued into queue.")
                        pending += 1
                    else:
                        self.logger.info("[" + name + "] remains UPDATED status.")
                        uptodate += 1
                elif image['status'] == "pending":
                    self.logger.info("[" + name + "] remains PENDING status")
                    pending += 1
            else:
                # the image is removed from the database if it is not present in Docker Hub
                self.logger.info("[" + name + "] deleted from local db because it is not alive in DockerHub")
                self.client_images.delete_image(image_id)
                removed += 1

        if self.path_file_logging:
            self.file_logger.info(str(tot_hub_images) + ":" +
                                  str(tot_dockerfinder_images) + ":" +
                                  str(removed) + ":" + str(pending) + ":" +
                                  str(uptodate))

        assert tot_dockerfinder_images == (pending + uptodate + removed)
        self.logger.info("Removed=" + str(removed) + "; pending=" + str(pending) +
                         "; up-to-date=" + str(uptodate))

    def verify_images(self):
        """
        Scan all the images in the local database and fix the following problems:
          1) software entries whose version is ".go" or ".";
          2) is_private = null;
          3) is_automated = null;
        by updating the boolean values from Docker Hub.
        """
        json_res = self.client_images.get_images()
        tot_dockerfinder_images = json_res['count']
        self.logger.info(str(tot_dockerfinder_images) +
                         " images present in the local database")
        images = json_res['images']
        updated = 0
        for image in images:
            name = image['name']
            splitname = image['name'].split(":")
            repo = splitname[0]
            tag = splitname[1]
            json_response = self.client_hub.get_json_repo(repo)
            if json_response:
                if "is_automated" in json_response:
                    image['is_automated'] = json_response['is_automated']
                if "is_private" in json_response:
                    image['is_private'] = json_response['is_private']

            # drop software entries with a bogus version ("." or ".go")
            softwares = image['softwares']
            self.logger.info("before: {0}".format(softwares))
            softwares = [sw for sw in softwares if sw['ver'] != '.']
            softwares = [sw for sw in softwares if sw['ver'] != ".go"]
            self.logger.info("after: {0}".format(softwares))
            image['softwares'] = softwares

            # PUT the new description of the image
            self.client_images.put_image(image)
            updated += 1
            self.logger.info("UPDATED [" + name + "]. {0}/{1}".format(
                updated, tot_dockerfinder_images))

    def run(self, interval_next_check):
        self.logger.info("Starting the checker module...")
        while True:
            try:
                self.check_images()
                time.sleep(interval_next_check)
            except Exception as e:
                self.logger.error(str(e))
                self.logger.error("Waiting 5s and restarting.")
                time.sleep(5)
class Crawler:
    def __init__(self,
                 exchange="dofinder",
                 queue="images",
                 route_key="images.scan",
                 amqp_url='amqp://*****:*****@127.0.0.1:5672',
                 images_url="http://127.0.0.1:3000/api/images",
                 hub_url="https://hub.docker.com",
                 path_last_url="/data/crawler/lasturl.txt",
                 policy="none",
                 min_stars=0,
                 min_pulls=0,
                 only_automated=False,
                 only_official=False):

        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")

        # publishes the crawled image names into the RabbitMQ server.
        self.publisher = PublisherRabbit(amqp_url,
                                         exchange=exchange,
                                         queue=queue,
                                         route_key=route_key)
        self.logger.info("RabbitMQ: exchange=" + exchange + ", queue=" + queue +
                         ", route key=" + route_key)

        # client of Docker Hub.
        self.client_hub = ClientHub(docker_hub_endpoint=hub_url,
                                    path_last_url=path_last_url)

        # client of the Images Service: if an image is NEW it is sent to the
        # queue, otherwise it is discarded.
        self.client_images = ClientImages(images_url=images_url)

        # map the crawling policy to the Docker Hub sort parameter
        ordi = {
            "stars_first": "-star_count",
            "pulls_first": "-pull_count",
            "none": None
        }
        self.ordering = ordi[policy]
        self.policy = policy
        self.min_stars = min_stars
        self.min_pulls = min_pulls
        self.only_automated = only_automated
        self.only_official = only_official
        self.logger.info(
            "Crawler: ordering={}, policy={}, min_stars={}, min_pulls={}, only_official={}, only_automated={}"
            .format(self.ordering, self.policy, min_stars, min_pulls,
                    only_official, only_automated))

    def run(self, from_page, page_size, num_samples=None, at_random=False,
            force_from_page=False):
        """
        Starts the publisher towards the RabbitMQ server and sends it the images
        crawled with the crawl() (or crawl_random_samples()) generator.

        If num_samples is not None:
            if at_random: crawls num_samples images with the random sampling method;
            else: crawls the first num_samples images in order.
        Otherwise it crawls all the images from Docker Hub.

        :param from_page: the starting page on Docker Hub.
        :param page_size: the number of images in a single page.
        :param num_samples: the number of image names to download (None = all).
        :param at_random: if True, sample num_samples images at random.
        :param force_from_page: force the crawling to start from from_page.
        :return:
        """
        try:
            if num_samples is not None:
                if at_random:
                    self.publisher.run(
                        images_generator_function=self.crawl_random_samples(
                            num_samples,
                            force_from_page,
                            from_page=from_page,
                            page_size=page_size))
                else:
                    self.publisher.run(images_generator_function=self.crawl(
                        force_from_page=force_from_page,
                        from_page=from_page,
                        page_size=page_size,
                        max_images=num_samples))
            else:
                self.publisher.run(images_generator_function=self.crawl(
                    force_from_page=force_from_page,
                    from_page=from_page,
                    page_size=page_size,
                    max_images=None))
        except KeyboardInterrupt:
            self.publisher.stop()

    def crawl_random_samples(self, m_samples, force_from_page, from_page, page_size):
        """
        Generator function that crawls Docker image names at random from Docker Hub.
        The following random sampling of a stream of KNOWN length n is used:

            s = 0                       # number of items selected so far
            for (j = 1; j <= n; j++):
                p = Rand(0, 1)
                if p <= (m - s) / (n - j + 1):
                    select S[j]; s++

        :param m_samples: number of images to sample.
        :param from_page: the starting page on Docker Hub.
        :param page_size: the number of images per page that Docker Hub returns.
        :return: generator of JSON image descriptions.
        """
        # TODO: max_images and m_samples are different: max_images tells the Docker Hub
        # client the maximum number of images to download, while m_samples is the number
        # of images to be sampled from Docker Hub.
        sent_images = 0
        max_images = None
        previous_num_sampled = 0  # only for logging when the number of sampled images changes
        j = 0  # number of images that have passed through the stream

        # total number of images stored on Docker Hub
        num_images = self.client_hub.count_all_images()
        # TODO: not all the images on Docker Hub are downloaded if a filter function is applied.
        self.logger.info("Random sampling activated.\n\t\tTarget samples: " +
                         str(m_samples) + ", Total number of images: " +
                         str(num_images) + "\n\t\tPercentage: " +
                         str(m_samples / num_images))

        for list_images in self.client_hub.crawl_images(
                from_page=from_page,
                page_size=page_size,
                max_images=num_images,
                force_from_page=force_from_page,
                sort=self.ordering):
            previous_num_sampled = sent_images  # remember the previous count
            for image in list_images:
                # random sampling over a stream of images of known length
                j += 1
                p = random.uniform(0, 1)  # 0 <= p <= 1
                if p <= (m_samples - sent_images) / (num_images - j + 1):
                    repo_name = image['repo_name']
                    sent_images += 1
                    yield json.dumps({"name": repo_name})
            if sent_images > previous_num_sampled:
                self.logger.info(
                    "{0}/{1} (Current samples/Target samples)".format(
                        str(sent_images), str(m_samples)))

        self.logger.info("Total sampled images: {0}".format(str(sent_images)))

    def crawl(self, force_from_page, from_page, page_size, max_images=None):
        """
        Generator function that crawls the Docker image names from Docker Hub
        and yields a JSON description of each image.

        :param from_page: the starting page on Docker Hub.
        :param page_size: the number of images per page that Docker Hub returns.
        :param max_images: the number of images to download (None = all).
        :return: generator of JSON image descriptions.
        """
        sent_images = 0
        count = self.client_hub.count_all_images()
        if max_images is None:
            max_images = count
        self.logger.info("Consecutive sampling activated.\n\t\tTarget: " +
                         str(max_images) + ", Total images: " + str(count) +
                         "\n\t\tPercentage: " + str(max_images / count))

        for image in self.client_hub.crawl_images(
                from_page=from_page,
                page_size=page_size,
                max_images=max_images,
                force_from_page=force_from_page,
                sort=self.ordering,
                filter_repo=self.filter_tosker):
            sent_images += 1
            if sent_images % 100 == 0:
                self.logger.info(
                    "{0} number of images sent to analyser".format(sent_images))
            yield json.dumps(image)

        self.logger.info(
            "Total num of images sent to queue: {0}".format(sent_images))

    def filter_latest(self, image_with_tag):
        """
        Filters the images with the *latest* tag.
        An image is sent to RabbitMQ only if it is new to the local database,
        otherwise it is discarded (the checker is in charge of re-queuing the
        images that are not up to date).

        :param image_with_tag: the image description, including its tag.
        :return: True if the image must be downloaded, False if it must be discarded.
        """
        self.logger.debug("[" + image_with_tag['tag'] + "] filtering latest tag.")
        return image_with_tag['tag'] == "latest"

    def filter_tosker(self, image):
        """
        Selects an image only if it satisfies the configured thresholds
        (minimum stars/pulls) and the only_automated/only_official flags.
        """
        select_image = True
        stars = image['star_count']
        pulls = image['pull_count']
        is_automated = image['is_automated']
        is_official = image['is_official']

        if stars < self.min_stars:
            select_image = False
        if pulls < self.min_pulls:
            select_image = False
        if self.only_automated and not is_automated:
            self.logger.debug("not automated")
            select_image = False
        if self.only_official and not is_official:
            self.logger.debug("not official")
            select_image = False
        return select_image
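

# ---------------------------------------------------------------------------
# Illustrative sketches (not part of the original module). The helper names
# and the example values below are assumptions added for documentation.
# ---------------------------------------------------------------------------
def _example_known_stream_sampling(stream, m_samples):
    """Standalone sketch of the sampling rule used by crawl_random_samples():
    the j-th item of a stream of known length n is selected with probability
    (m - s) / (n - j + 1), where s items have already been selected. Assuming
    m <= n, this selects exactly m items uniformly at random without storing
    the whole stream."""
    n = len(stream)
    selected = []
    for j, item in enumerate(stream, start=1):
        if random.uniform(0, 1) <= (m_samples - len(selected)) / (n - j + 1):
            selected.append(item)
    return selected


def _example_run_crawler():  # hypothetical helper, never called by the module
    """Minimal wiring of the Crawler; credentials and thresholds are placeholders."""
    crawler = Crawler(
        amqp_url="amqp://guest:guest@127.0.0.1:5672",  # placeholder credentials
        images_url="http://127.0.0.1:3000/api/images",
        hub_url="https://hub.docker.com",
        policy="stars_first",
        min_stars=5,
        min_pulls=100,
        only_official=True,
    )
    # Sample 1000 repository names at random, reading 100 images per Hub page.
    crawler.run(from_page=1, page_size=100, num_samples=1000, at_random=True)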