Example #1
import json
import logging
import pickle

import docker  # docker-py < 2.0; newer SDKs expose this client as docker.APIClient

# Crawler, ClientHub and PublisherRabbit are this project's own modules.


class Tester:
    def __init__(self, path_file_images="images.test", hub_url="https://hub.docker.com/"):
        self._path = path_file_images
        self.crawler = Crawler()
        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")
        # the client hub interacts with the Docker Hub registry
        self.client_hub = ClientHub(docker_hub_endpoint=hub_url, path_last_url=None)
        self.client_daemon = docker.Client(base_url='unix://var/run/docker.sock')

    def build_test(self, num_images_test=100, from_page=1, page_size=10):
        list_json_images = list(self.crawler.crawl(max_images=num_images_test, from_page=from_page, page_size=page_size))
        self.dump_test_images(list_json_images)

    def dump_test_images(self, list_images):
        with open(self._path, "wb") as f:
            pickle.dump(list_images, f)
            self.logger.info("Saved {0} images for testing in {1}".format(len(list_images), self._path))

    def push_test(self, amqp_url="amqp://*****:*****@180.0.0.3:5672", exchange="dofinder", queue="test", route_key="images.test"):
        publisher = PublisherRabbit(amqp_url, exchange=exchange, queue=queue, route_key=route_key)
        publisher.run(images_generator_function=self.generator_images_test())

    def generator_images_test(self):
        try:
            with open(self._path, "rb") as f:
                list_images = pickle.load(f)
                self.logger.info("Read  {1} images for testing in file".format(len(list_images), self._path))
                for image in list_images:
                    yield json.dumps(json.loads(image))
        except FileNotFoundError:
            logger.exception(" Error open file " + path_name_file + ". \n Try [ build test ] command before")
            raise
        except Exception:
            logger.exception("unexpected Exception")
            raise

    def pull_officials(self):
        # TODO: raise an exception when the connection to Docker Hub fails
        # download all the official library images
        images_libraries = self.client_hub.crawl_official_images()
        self.logger.info("[" + str(len(images_libraries)) + "] number of official images to pull...")
        for image in images_libraries:
            try:
                self.client_daemon.pull(image)
            except docker.errors.APIError:
                self.logger.exception("Docker api error")


    def remove_no_officials(self):
        images_libraries = self.client_hub.crawl_official_images()
        all_images = self.client_daemon.images()

        for image in all_images:
            image_tags = image['RepoTags']
            for repo_tag in image_tags:   #repo_tag = "repo_name:latest"
                name = repo_tag.split(":")[0]
                if name not in images_libraries:
                    self.logger.info("Removing  " + repo_tag)
                    self.client_daemon.remove_image(repo_tag, force=True)
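
A minimal driver for the Tester above, as a sketch: it assumes the project's Crawler, ClientHub and PublisherRabbit modules are importable and that a RabbitMQ broker is reachable (the credentials in these examples are elided, so placeholders are used here).

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    tester = Tester(path_file_images="images.test")
    # crawl 50 image descriptions and pickle them to images.test
    tester.build_test(num_images_test=50, from_page=1, page_size=10)
    # publish the pickled test set to the "test" queue
    tester.push_test(amqp_url="amqp://user:pass@localhost:5672",  # placeholder credentials
                     exchange="dofinder", queue="test", route_key="images.test")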
Example #2
import json
import logging
import time
from logging.handlers import TimedRotatingFileHandler

import pika

# ClientImages and ClientHub are this project's own modules.


class Checker:
    def __init__(self,
                 images_url="http://127.0.0.1:3000/api/images",
                 hub_url="https://hub.docker.com/",
                 amqp_url='amqp://*****:*****@127.0.0.1:5672',
                 exchange="dofinder",
                 queue="images",
                 route_key="images.scan",
                 path_file_logging=None):

        # For publishing the image name into the RabbitMQ queue
        self.url_amqp = amqp_url
        self._exchange = exchange
        self._route_key = route_key

        # stream handler logger

        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")

        # file handler logger
        self.path_file_logging = path_file_logging
        self.file_logger = None
        if path_file_logging:
            name_file_logger = __class__.__name__ + "-rotated"
            self.file_logger = logging.getLogger(name_file_logger)

            self.file_logger.setLevel(logging.DEBUG)

            interval = 24
            backupCount = 10  # keep 10 days of backups
            self.logger.info("LOGGING PATH: " + path_file_logging +
                             ", rotating every " + str(interval) +
                             "h, backupCount=" + str(backupCount))

            handler = TimedRotatingFileHandler(path_file_logging,
                                               when="h",
                                               interval=interval,
                                               backupCount=backupCount)

            LOG_FORMAT = '%(asctime)s %(message)s'
            formatter = logging.Formatter(LOG_FORMAT)
            # attach the rotating file handler to the logger
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
            self.file_logger.addHandler(handler)
            # header for the colon-separated stats line written by check_images():
            # tot_hub_images:tot_dockerfinder_images:removed:pending:uptodate
            self.file_logger.info(
                "hubtot:dftot:dfremoved:dfpending:dfuptodate")

        # client of Images Service:  in order to add and update the image
        # description.
        self.client_images = ClientImages(images_url=images_url)

        # client of Docker Hub.
        self.client_hub = ClientHub(docker_hub_endpoint=hub_url)

    def send_to_rabbitmq(self, msg):
        connection = pika.BlockingConnection(pika.URLParameters(self.url_amqp))
        self.logger.info("connected to " + self.url_amqp)
        # Open the channel
        channel = connection.channel()
        # Publish on the configured exchange with the routing key
        self.logger.info(self._route_key)
        channel.basic_publish(exchange=self._exchange,
                              routing_key=self._route_key,
                              body=msg)

        self.logger.info(msg + " sent to " + self._exchange)

        connection.close()

    def check_images(self):
        """
        scan the images.
        """
        checked = {}
        tot_dockerfinder_images = 0
        removed = 0
        pending = 0
        uptodate = 0

        json_res = self.client_images.get_images()
        tot_dockerfinder_images = json_res['count']

        self.logger.info(
            str(tot_dockerfinder_images) +
            " images present into local database")
        tot_hub_images = self.client_hub.count_all_images()
        self.logger.info(
            str(tot_hub_images) + ": images present into Docker Hub")
        images = json_res['images']
        for image in images:
            name = image['name']
            #splitname = image['name'].split(":")
            repo = image['repo_name']  # splitname[0]
            tag = image['tag']  # splitname[1]
            image_id = image['_id']
            if self.client_hub.is_alive_in_hub(repo, tag=tag):
                self.logger.debug("[" + name + "] status:" + image['status'])
                if image['status'] == "updated":
                    # str(datetime.datetime.min)
                    if self.client_images.must_scanned(
                            name,
                            image['last_updated'] if image['last_updated'] else
                            "2000-01-01T00:00:00.000Z"):
                        self.logger.debug("[" + name +
                                          "] must be scanned again.")
                        self.client_images.update_status(
                            image_id, "pending")  # Set status to Pending
                        self.logger.info("[" + name +
                                         "] from UPDATED to PENDING status.")
                        self.send_to_rabbitmq(
                            json.dumps({
                                "name": name,
                                "repo_name": repo,
                                "tag": tag
                            }))
                        self.logger.info("[" + name + "] requeud into queue.")
                        # checked['pending'].append(name)
                        pending += 1
                    else:
                        self.logger.info("[" + name +
                                         "] remains UPDATED status.")
                        uptodate += 1
                if image['status'] == "pending":
                    self.logger.info("[" + name + "] remains PENDING status")
                    pending += 1
            else:
                # the image is removed from the database if it is not present
                # into Docker Hub
                self.logger.info(
                    "[" + name +
                    "] deleted from local db because it is not alive in DockerHub"
                )
                self.client_images.delete_image(image_id)
                removed += 1
        if self.path_file_logging:
            self.file_logger.info(
                str(tot_hub_images) + ":" + str(tot_dockerfinder_images) +
                ":" + str(removed) + ":" + str(pending) + ":" + str(uptodate))
        assert tot_dockerfinder_images == (pending + uptodate + removed)
        self.logger.info("Removed=" + str(removed) + "; pending=" +
                         str(pending) + " up-to-date=" + str(uptodate))

    def verify_images(self):
        """
        Scan all the images in the local databse and fix the problems
           1) having the ".go" or "." version of a software.
           2) is_private = null,
           3) is_automated = nul
        by updating the boolean value from Doker Hub.

        """

        json_res = self.client_images.get_images()
        tot_dockerfinder_images = json_res['count']
        self.logger.info(
            str(tot_dockerfinder_images) +
            " images present into local database")
        images = json_res['images']
        updated = 0
        for image in images:
            name = image['name']
            splitname = image['name'].split(":")
            repo = splitname[0]
            tag = splitname[1]
            json_response = self.client_hub.get_json_repo(repo)
            if json_response:
                if "is_automated" in json_response:
                    image['is_automated'] = json_response['is_automated']

                if "is_private" in json_response:
                    image['is_private'] = json_response['is_private']
                softwares = image['softwares']
                self.logger.info("before: {0}".format(softwares))
                # [0-9]+[.][0-9]*[.0-9]

                softwares = [sw for sw in softwares if sw['ver'] != '.']
                softwares = [sw for sw in softwares if sw['ver'] != ".go"]
                self.logger.info("after: {0}".format(softwares))
                image['softwares'] = softwares

            # PUT the new image description of the image
            self.client_images.put_image(image)
            updated += 1
            self.logger.info(
                "UPDATED [" + name +
                "]. {0}/{1}".format(updated, tot_dockerfinder_images))

    def run(self, interval_next_check):
        self.logger.info("Starting the checker module...")
        while True:
            try:
                self.check_images()
                time.sleep(interval_next_check)
            except Exception as e:
                self.logger.error(str(e))
                self.logger.error("Waiting 5s and restarting.")
                time.sleep(5)
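
A minimal sketch of driving the Checker above (assuming the Images Service and Docker Hub are reachable at the default URLs; the one-hour interval and log path are illustrative choices):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    checker = Checker(images_url="http://127.0.0.1:3000/api/images",
                      path_file_logging="checker-stats.log")  # hypothetical log path
    checker.run(interval_next_check=3600)  # re-check the whole database every hour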
Example #3
import json
import logging
import random

# PublisherRabbit, ClientHub and ClientImages are this project's own modules.


class Crawler:
    def __init__(self,
                 exchange="dofinder",
                 queue="images",
                 route_key="images.scan",
                 amqp_url='amqp://*****:*****@127.0.0.1:5672',
                 images_url="http://127.0.0.1:3000/api/images",
                 hub_url="https://hub.docker.com",
                 path_last_url="/data/crawler/lasturl.txt",
                 policy="none",
                 min_stars=0,
                 min_pulls=0,
                 only_automated=False,
                 only_official=False):

        self.logger = logging.getLogger(__class__.__name__)
        self.logger.info(__class__.__name__ + " logger initialized")

        # publish the images downloaded into the rabbitMQ server.
        self.publisher = PublisherRabbit(amqp_url,
                                         exchange=exchange,
                                         queue=queue,
                                         route_key=route_key)
        self.logger.info("RabbitMQ : exchange=" + exchange + ", queue=" +
                         queue + " route key=" + route_key)

        # Client of Docker Hub.
        self.client_hub = ClientHub(docker_hub_endpoint=hub_url,
                                    path_last_url=path_last_url)

        # client of Images Service:  if an image is NEW it is sent to queue,
        # otherwise it is discarded
        self.client_images = ClientImages(images_url=images_url)

        # ordering = {"stars":"star_count", "-stars":"-star_count",
        # "pulls":"pull_count", "-pulls":"-pull_count"}
        ordi = {
            "stars_first": "-star_count",
            "pulls_first": "-pull_count",
            "none": None
        }
        # {"stars_first":"-star_count", "pulls_first"  : "-pull_count",
        #  "none":None}.get(policy, None)
        self.ordering = ordi[policy]
        self.policy = policy

        self.min_stars = min_stars
        self.min_pulls = min_pulls
        self.only_automated = only_automated
        self.only_official = only_official
        self.logger.info(
            "Crawler: ordering={}, policy={}, min_stars={}, min_pulls={}, only_official={}, only_automated={}"
            .format(self.ordering, self.policy, min_stars, min_pulls,
                    only_official, only_automated))

    def run(self,
            from_page,
            page_size,
            num_samples=None,
            at_random=False,
            force_from_page=False):
        """
        Starts the publisher of the RabbitMQ server, and send to the images crawled with the crawl() method.
        If num_sample != None:
            if at_random:
                crawls num_samples images using the random sampling method.
            else:
                crawl num_samples images in order
        else :
            crawls all the images from the Docker Hub.,

        :param from_page:  the starting page into the Docker Hub.
        :param page_size:  the number of images in a single page.
        :param max_images:  the number of images name to downloads.
        :return:
        """
        try:
            if num_samples is not None:
                if at_random:
                    self.publisher.run(
                        images_generator_function=self.crawl_random_samples(
                            num_samples,
                            force_from_page,
                            from_page=from_page,
                            page_size=page_size))
                else:
                    self.publisher.run(images_generator_function=self.crawl(
                        force_from_page=force_from_page,
                        from_page=from_page,
                        page_size=page_size,
                        max_images=num_samples))
            else:
                self.publisher.run(images_generator_function=self.crawl(
                    force_from_page=force_from_page,
                    from_page=from_page,
                    page_size=page_size,
                    max_images=None))
        except KeyboardInterrupt:
            self.publisher.stop()

    def crawl_random_samples(self, m_samples, force_from_page, from_page,
                             page_size):
        """
        This is a generator function that crawls docker images name at random name the Docker HUb.
        The following random sampling of a kNOWN STREAM is used.

            s = 0                //number of item selected
             for (j=1 ; j <= n; j++)
               p = Rand(0,1)
               if (p <= (m-s)/ n-j+1):
                  select S[j];
                  s++
        :param m_samples: number of sampled images,
        :param from_page:  the starting page into the Docker Hub.
        :param page_size:  is the number of images per image that Docker Hub return.
        :param max_images:  the number of images to download.
        :return: generator of JSON images description
        """

        # TODO: max_images and num_samples differ: max_images tells the Docker Hub
        # client the maximum number of images to download, while num_samples is
        # the number of images to be sampled from Docker Hub.

        sent_images = 0
        max_images = None
        previous_num_sampled = 0  # only for logging when the number of sampled images changes
        j = 0  # number of images seen so far in the stream
        # total number of images stored within Docker Hub
        num_images = self.client_hub.count_all_images()
        # TODO: not all the images in Docker Hub are downloaded when the
        # filter function is applied
        self.logger.info("Random sampling activated. \n\t\tTarget samples:" +
                         str(m_samples) + ", Total number of images: " +
                         str(num_images) + "\n\t\tPercentage:" +
                         str(m_samples / num_images))
        for list_images in self.client_hub.crawl_images(
                from_page=from_page,
                page_size=page_size,
                max_images=num_images,
                force_from_page=force_from_page,
                sort=self.ordering):
            # filter_images=self.filter_tag_latest):
            previous_num_sampled = sent_images  # set the previous sent images
            for image in list_images:
                # Random sampling over a stream of images
                j += 1
                # if j <= num_images : # otherwise division by zero
                p = random.uniform(0, 1)  # 0 <= p <= 1
                # if (p <= (m-s)/ n-j+1):
                if p <= (m_samples - sent_images) / (num_images - j + 1):
                    repo_name = image['repo_name']
                    sent_images += 1
                    yield json.dumps({"name": repo_name})
            if sent_images > previous_num_sampled:
                self.logger.info(
                    "{0}/{1} (Current samples/Target samples)".format(
                        str(sent_images), str(m_samples)))

        self.logger.info("Total sampled images: {0}".format(str(sent_images)))

    def crawl(self, force_from_page, from_page, page_size, max_images=None):
        """
        The crawl() is a generator function. It crawls the docker images name from the Docker HUb.
        It yeld a  JSON of the image.
        :param from_page:  the starting page into the Docker Hub.
        :param page_size:  is the number of images per image that Docker Hub return.
        :param max_images:  the number of images to download.
        :return: generator of JSON images description
        """
        sent_images = 0
        count = self.client_hub.count_all_images()
        if max_images is None:
            max_images = count
        self.logger.info("Consecutive sampling activated. \n\t\tTarget: " +
                         str(max_images) + ", Total images: " +
                         str(count) + "\n\t\tPercentage: " +
                         str(max_images / count))
        for image in self.client_hub.crawl_images(
                from_page=from_page,
                page_size=page_size,
                max_images=max_images,
                force_from_page=force_from_page,
                sort=self.ordering,
                # filter_image_tag=self.filter_tag
                # filter_tag=self.filter_latest,
                filter_repo=self.filter_tosker):
            sent_images += 1
            if sent_images % 100 == 0:
                self.logger.info(
                    "{0} number of images sent to analyser".format(
                        sent_images))
            yield json.dumps(image)

        self.logger.info(
            "Total num of images sent to queue: {0}".format(sent_images))

    def filter_latest(self, image_with_tag):
        """
        Filters the images with the *latest* tag.
        An image is sent to RabbitMQ only if it is new to the local database; otherwise it is
        discarded (the checker is in charge of requeuing the images that are not updated).
        :param image_with_tag: an image description including its tag
        :return: True if the image must be downloaded, False if it must be discarded
        """
        self.logger.debug("[" + image_with_tag['tag'] +
                          "] filtering on the latest tag.")

        return image_with_tag['tag'] == "latest"

    def filter_tosker(self, image):
        """
        Selects an image only if it satisfies the configured thresholds
        (min_stars, min_pulls) and flags (only_automated, only_official).
        """
        select_image = True
        stars = image['star_count']
        pulls = image['pull_count']
        is_automated = image['is_automated']
        is_official = image['is_official']

        if stars < self.min_stars:
            select_image = False
        if pulls < self.min_pulls:
            select_image = False
        if self.only_automated and not is_automated:
            self.logger.debug("not automated")
            select_image = False
        if self.only_official and not is_official:
            self.logger.debug("not official")
            select_image = False
        return select_image
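
A minimal sketch of driving the Crawler above (the policy, thresholds and page size are illustrative; the broker credentials in these examples are elided, so a placeholder is used):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler(policy="stars_first", min_stars=5, min_pulls=100,
                      amqp_url="amqp://user:pass@127.0.0.1:5672")  # placeholder credentials
    # sample 1000 image names at random and publish them to the "images" queue
    crawler.run(from_page=1, page_size=100, num_samples=1000, at_random=True)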