Example #1
    def __init__(self):
        self.axdb_client = AxdbClient()
        self.axops_client = AxopsClient()
        self.axsys_client = AxsysClient()
        self.redis_client = RedisClient('redis', db=DB_REPORTING)
        self.event_notification_client = EventNotificationClient(FACILITY_GATEWAY)
        self.scm_clients = {
            ScmVendors.BITBUCKET: BitBucketClient(),
            ScmVendors.GITHUB: GitHubClient(),
            ScmVendors.GITLAB: GitLabClient()
        }
        self.repo_manager = RepoManager(DEFAULT_CONCURRENCY, DEFAULT_INTERVAL)
        self.event_trigger = EventTrigger()
Example #2
    def __init__(self, cluster_name_id, region=None, profile=None):
        self.cluster_name_id = cluster_name_id

        # Region and profile info can be passed in with upgrade code path,
        # when this is run from axclustermanager outside cluster.
        self.region = AWSMetaData().get_region() if region is None else region
        self.profile = profile
        if profile is None:
            session = boto3.Session(region_name=self.region)
        else:
            session = boto3.Session(region_name=self.region,
                                    profile_name=profile)

        self.ec2 = session.resource('ec2')
        self.client = session.client('ec2')
        self.cluster_info = AXClusterInfo(cluster_name_id=cluster_name_id,
                                          aws_profile=profile)
        self.cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id,
                                              aws_profile=profile)
        cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self.s3_bucket = cluster_config_path.bucket()
        self.s3_config_prefix = cluster_config_path.master_config_dir()
        self.s3_attributes_path = cluster_config_path.master_attributes_path()
        self.s3_user_data_path = cluster_config_path.master_user_data_path()

        logger.info(
            "Create MasterManager in region %s, attributes path: %s and user_data_path: %s",
            self.region, self.s3_attributes_path, self.s3_user_data_path)

        # The EC2 instance object for the current master
        self.master_instance = None

        # Properties/attributes to use when launching the new master
        self.attributes = {}

        # For upgrades.
        # The following values are left as None by the master manager and set to non-None values by the upgrade code.
        self.aws_image = None
        self.instance_profile = None

        self.event_notification_client = EventNotificationClient(
            FACILITY_PLATFORM)
Example #3
class BaseEventTranslator(object):
    """Base event translator."""

    parser = CommandParser()
    event_notification_client = EventNotificationClient(FACILITY_GATEWAY)

    @classmethod
    def _parse_command(cls, texts):
        """Extract command out of text.

        Currently, we support the following commands:
        (1) /ax rerun                                                           # Rerun failed service templates (subject to policy enforcement)
        (2) /ax rerun -a/--all                                                  # Rerun all service templates (subject to policy enforcement)
        (3) /ax run "AX Workflow Test"                                          # Run a specific service template
        (4) /ax run "AX Workflow Test" -p/--param namespace=staging             # Run a specific service template by supplying some parameters

        :param texts:
        :return:
        """
        texts = texts.splitlines()
        commands = []
        for i in range(len(texts)):
            text = texts[i].strip()
            if not text.startswith('/ax '):  # Not a command
                continue
            try:
                args = cls.parser.parse(text)
            except InvalidCommand as e:
                logger.warning('Failed to parse command: %s', e)
                cls.event_notification_client.send_message_to_notification_center(
                    CODE_JOB_CI_INVALID_COMMAND, detail={'command': text})
                continue
            else:
                command = {'command': args['command']}
                if command['command'] == AxCommands.RUN:
                    command['template'] = args['template']
                    command['parameters'] = {}
                    if args['parameters']:
                        for param in args['parameters']:
                            key, value = param.split('=', 1)  # split once so values may contain '='
                            command['parameters'][key] = value
                else:
                    command['rerun_all'] = args['rerun_all']
                commands.append(command)
        return commands
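
For illustration, a hedged usage sketch of _parse_command follows. The exact shape of the parsed arguments depends on CommandParser, which is not shown here; AxCommands.RERUN and the --param/--all flags are assumptions taken from the docstring.

# Hypothetical usage; assumes CommandParser accepts the syntax listed in the docstring.
text = '\n'.join([
    'Looks good to me.',
    '/ax run "AX Workflow Test" --param namespace=staging',
    '/ax rerun --all',
])
commands = BaseEventTranslator._parse_command(text)
# Expected result, based on the code above:
# [{'command': AxCommands.RUN,
#   'template': 'AX Workflow Test',
#   'parameters': {'namespace': 'staging'}},
#  {'command': AxCommands.RERUN, 'rerun_all': True}]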
Example #4
class AXMasterManager:
    def __init__(self, cluster_name_id, region=None, profile=None):
        self.cluster_name_id = cluster_name_id

        # Region and profile info can be passed in with upgrade code path,
        # when this is run from axclustermanager outside cluster.
        self.region = AWSMetaData().get_region() if region is None else region
        self.profile = profile
        if profile is None:
            session = boto3.Session(region_name=self.region)
        else:
            session = boto3.Session(region_name=self.region, profile_name=profile)

        self.ec2 = session.resource('ec2')
        self.client = session.client('ec2')
        self.cluster_info = AXClusterInfo(cluster_name_id=cluster_name_id, aws_profile=profile)
        self.cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id, aws_profile=profile)
        cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self.s3_bucket = cluster_config_path.bucket()
        self.s3_config_prefix = cluster_config_path.master_config_dir()
        self.s3_attributes_path = cluster_config_path.master_attributes_path()
        self.s3_user_data_path = cluster_config_path.master_user_data_path()

        logger.info("Create MasterManager in region %s, attributes path: %s and user_data_path: %s", self.region,
                    self.s3_attributes_path, self.s3_user_data_path)

        # The EC2 instance object for the current master
        self.master_instance = None

        # Properties/attributes to use when launching the new master
        self.attributes = {}

        # For upgrades.
        # The following values are left as None by the master manager and set to non-None values by the upgrade code.
        self.aws_image = None
        self.instance_profile = None

        self.event_notification_client = EventNotificationClient(FACILITY_PLATFORM)

    @retry(retry_on_exception=default_aws_retry, wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def discover_master(self, state=None):
        """
        Discovers the currently running master for the given cluster name.
        """
        if not state:
            state = [EC2InstanceState.Running]
        response = self.client.describe_instances(
            Filters=[
                {'Name': 'tag:Name', 'Values': [self.cluster_name_id + '-master']},
                {'Name': 'instance-state-name', 'Values': state}
            ]
        )
        # Confirm that there is only 1 master
        if len(response['Reservations']) == 0:
            logger.info("Master with state %s not found", state)
            return None

        assert len(response['Reservations']) == 1, "More than 1 master running (reservations != 1)!"
        assert len(response['Reservations'][0]['Instances']) == 1, "Not exactly 1 master instance is running! {}".format(response['Reservations'][0]['Instances'])
        return response['Reservations'][0]['Instances'][0]['InstanceId']
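
The @retry decorator comes from the retrying library: wait_exponential_multiplier=2000 waits roughly 2s, 4s, 8s, ... between attempts, stop_max_attempt_number=5 gives up after five tries, and retry_on_exception (here default_aws_retry) decides which exceptions are retryable. A minimal standalone sketch of the same pattern, with a made-up flaky_call in place of the AWS call:

from retrying import retry

state = {'attempts': 0}

def is_transient(exc):
    # Stand-in for default_aws_retry: retry only on errors we consider transient.
    return isinstance(exc, IOError)

@retry(retry_on_exception=is_transient, wait_exponential_multiplier=2000, stop_max_attempt_number=5)
def flaky_call():
    # Fails twice with a retryable error, then succeeds on the third attempt.
    state['attempts'] += 1
    if state['attempts'] < 3:
        raise IOError('transient hiccup')
    return 'ok'

print(flaky_call())  # 'ok'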

    def user_data_fixup(self, user_data):
        """
        The original user-data used for creating the master can become obsolete after upgrades. There are 5
        fields from the original user-data that need to be "fixed". The SERVER_BINARY_TAR_URL, SALT_TAR_URL,
        SERVER_BINARY_TAR_HASH, SALT_TAR_HASH and the wget command that downloads the bootstrap-script.
        """
        from ax.platform.kube_env_config import kube_env_update
        # TODO: It's not ideal to use env variables for passing arguments.
        # Env variables could be different between running as server and from upgrade.
        kube_version = os.getenv('KUBE_VERSION', os.getenv('NEW_KUBE_VERSION')).strip()
        cluster_install_version = os.getenv('AX_CLUSTER_INSTALL_VERSION', os.getenv('NEW_CLUSTER_INSTALL_VERSION')).strip()
        server_binary_tar_hash = os.getenv('SERVER_BINARY_TAR_HASH', os.getenv('NEW_KUBE_SERVER_SHA1')).strip()
        salt_tar_hash = os.getenv('SALT_TAR_HASH', os.getenv('NEW_KUBE_SALT_SHA1')).strip()
        updates = {
            "new_kube_version": kube_version,
            "new_cluster_install_version": cluster_install_version,
            "new_kube_server_hash": server_binary_tar_hash,
            "new_kube_salt_hash": salt_tar_hash,
            "new_api_servers": self.attributes['private_ip_address'],
        }
        dec = zlib.decompressobj(32 + zlib.MAX_WBITS)  # offset 32 to skip the header
        unzipped_user_data = dec.decompress(base64.b64decode(user_data))

        # Zip output buffer. For details: http://bit.ly/2gv3WKt
        comp = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
        zipped_data = comp.compress(kube_env_update(unzipped_user_data, updates)) + comp.flush()

        # Convert output to base64 encoded
        logger.info("User data fixup completed")
        return base64.b64encode(zipped_data)
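
The decompress/compress pair above relies on two zlib conventions: adding 32 to MAX_WBITS lets decompressobj auto-detect and skip a gzip header, while MAX_WBITS | 16 makes compressobj emit one. A self-contained round trip with placeholder data:

import base64
import zlib

payload = b'KUBE_VERSION: v1.6.7\n'  # placeholder kube-env content

# Gzip-wrap at maximum compression level, as in user_data_fixup.
comp = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
encoded = base64.b64encode(comp.compress(payload) + comp.flush())

# Offset 32 tells zlib to detect and skip the gzip header on the way back.
dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
assert dec.decompress(base64.b64decode(encoded)) == payload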

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_user_data(self):
        """
        Gets the user-data for the current master. Note that the user-data is base64 encoded when it is
        downloaded. Writes the data into a file.
        """
        # The user-data is base64 encoded.
        user_data = self.client.describe_instance_attribute(
            Attribute='userData', InstanceId=self.master_instance.instance_id)['UserData']['Value']
        # Download user-data and store it in a temporary file. This data is base64 encoded.
        # It is better to use a well-known location for this file rather than one generated by mkstemp (or variants).
        # That way, this file can be populated the first time this pod runs, or even later, by simply downloading
        # the user-data from S3.
        try:
            user_data = self.user_data_fixup(user_data)
        except Exception as e:
            logger.exception("Failed while fixing up user-data")
            raise AXPlatformException("Failed while fixing up user-data: " + str(e))

        with open(USER_DATA_FILE_NEW, "w") as f:
            f.write(user_data)
        return USER_DATA_FILE_NEW

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_master_pd_volume_metadata(self):
        """
        Gets the metadata for the Master's persistent disk (EBS volume).
        """
        volume_metadata = self.client.describe_volumes(
            Filters=[
                {'Name': 'attachment.instance-id', 'Values': [self.master_instance.instance_id,]},
                {'Name': 'tag:Name', 'Values' : [self.cluster_name_id + "-master-pd"]}
            ])
        assert volume_metadata is not None, "Failed to retrieve volume metadata"
        return volume_metadata

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_route_table_id(self):
        """
        Gets the route table used by the given cluster.
        """
        response = self.client.describe_route_tables(
            Filters=[{'Name': 'tag:KubernetesCluster', 'Values': [self.cluster_name_id]}])
        assert len(response["RouteTables"]) == 1, "There should be a single route-table!"
        assert "RouteTableId" in response["RouteTables"][0], "RouteTableId not in response"

        route_table_id = response["RouteTables"][0]["RouteTableId"]
        logger.debug("Using route-table-id %s", route_table_id)
        return route_table_id

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_root_dev_attrs(self):
        assert self.master_instance, "Master instance not set"
        root_dev_id = None
        root_dev_size = None
        root_dev_type = None
        for dev in self.master_instance.block_device_mappings:
            if dev['DeviceName'] == self.master_instance.root_device_name:
                root_dev_id = dev['Ebs']['VolumeId']

                dev_metadata = self.client.describe_volumes(VolumeIds=[root_dev_id])
                root_dev_size = dev_metadata['Volumes'][0]['Size']
                root_dev_type = dev_metadata['Volumes'][0]['VolumeType']

                break

        assert self.master_instance.root_device_name and root_dev_size and root_dev_type, "Failed to get root device attributes"
        logger.info("Root device attributes: %s, %s, %s", self.master_instance.root_device_name, root_dev_size, root_dev_type)
        self.attributes['root_dev_name'] = self.master_instance.root_device_name
        self.attributes['root_dev_size'] = str(root_dev_size)
        self.attributes['root_dev_type'] = root_dev_type

    def populate_attributes(self):
        """
        Collects attributes that will be persisted and used for spinning up the new master instance.
        Populates the "attributes" member dict with all the values.
        """
        # Upgrade might overwrite these attributes. Use them if set.
        # Otherwise get them from existing master instance.
        image_id = self.aws_image if self.aws_image else self.master_instance.image_id
        instance_profile = self.instance_profile if self.instance_profile else self.master_instance.iam_instance_profile["Arn"]
        self.attributes['image_id'] = image_id
        self.attributes['instance_type'] = self.master_instance.instance_type
        self.attributes['vpc_id'] = self.master_instance.vpc_id
        self.attributes['key_name'] = self.master_instance.key_name
        self.attributes['placement'] = self.master_instance.placement
        self.attributes['arn'] = instance_profile
        self.attributes['subnet_id'] = self.master_instance.subnet_id
        self.attributes['private_ip_address'] = self.master_instance.private_ip_address
        target_sgs = []
        for sg in self.master_instance.security_groups:
            target_sgs.append(sg["GroupId"])
        self.attributes['security_group_ids'] = target_sgs
        self.attributes['user_data_file'] = self.get_user_data()
        self.attributes['master_tags'] = self.master_instance.tags

        # Retrieve master-pd and master-eip from the volume_metadata
        volume_metadata = self.get_master_pd_volume_metadata()
        if volume_metadata['Volumes'] and volume_metadata['Volumes'][0]:
            if volume_metadata['Volumes'][0]['VolumeId']:
                vol_id = volume_metadata['Volumes'][0]['VolumeId']
                self.attributes['master_pd_id'] = vol_id
                self.attributes['master_pd_device'] = volume_metadata['Volumes'][0]['Attachments'][0]['Device']

            # Retrieve tags of master-pd. Get EIP from master.
            for tag in volume_metadata['Volumes'][0]['Tags']:
                if tag['Key'] == "kubernetes.io/master-ip":
                    master_eip = tag["Value"]
                    self.attributes['master_eip'] = master_eip
                    break
        # Use .get() so a missing key trips the assert message instead of raising KeyError
        assert self.attributes.get('master_pd_id') is not None, "Failed to find Master's persistent disk"
        assert self.attributes.get('master_pd_device') is not None, "Failed to find attachment info for Master's persistent disk"
        assert self.attributes.get('master_eip') is not None, "Failed to find Master's Elastic IP"

        self.attributes['route_table_id'] = self.get_route_table_id()
        self.attributes['pod_cidr'] = self.cluster_config.get_master_pod_cidr()
        self.attributes['ebs_optimized'] = self.master_instance.ebs_optimized

        # Get root device attributes
        self.get_root_dev_attrs()

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def add_tags(self, instance):
        """
        Adds tags to the new master instance.

        :param instance: The new master ec2 instance.
        """
        response = self.client.create_tags(
            Resources=[instance.instance_id],
            Tags=self.attributes['master_tags']
        )

        logger.info("Attached tags to new master")

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def attach_eip(self, instance):
        """
        Attaches the EIP to the master instance.

        :param instance: The new master ec2 instance.
        """
        eip_meta = self.client.describe_addresses(PublicIps=[self.attributes['master_eip']])
        assert eip_meta is not None, "Failed to get details about EIP " + self.attributes['master_eip']
        assert eip_meta['Addresses'] and len(eip_meta['Addresses']) == 1, "Error getting EIP address details"
        response = self.client.associate_address(
            InstanceId=instance.instance_id,
            AllocationId=eip_meta['Addresses'][0]['AllocationId'],
            AllowReassociation=True
        )
        logger.info("Attached EIP to new master: %s", response['ResponseMetadata']['HTTPStatusCode'])

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def attach_volume(self, instance):
        """
        Attaches the EBS volume to the master instance.

        :param instance: The new master ec2 instance.
        """
        response = instance.attach_volume(
            VolumeId=self.attributes['master_pd_id'],
            Device=self.attributes['master_pd_device'])
        logger.info("Attached volume to new master: %s", response['ResponseMetadata']['HTTPStatusCode'])

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def replace_route(self, instance):
        response = self.client.replace_route(
            RouteTableId=self.attributes['route_table_id'],
            DestinationCidrBlock=self.attributes['pod_cidr'],
            InstanceId=instance.instance_id
        )
        logger.info("Replaced master route %s with %s: %s", self.attributes['pod_cidr'], instance.instance_id, response['ResponseMetadata']['HTTPStatusCode'])

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def run_new_master(self, user_data):
        """
        Uses the boto APIs to run new instances of the master. Retries in case of failure.

        :param user_data: The user-data to use for the new instance.
        """
        try:
            response = self.client.run_instances(
                ImageId=self.attributes['image_id'],
                MinCount=1,
                MaxCount=1,
                KeyName=self.attributes['key_name'],
                UserData=user_data,
                InstanceType=self.attributes['instance_type'],
                Placement=self.attributes['placement'],
                IamInstanceProfile={"Arn": self.attributes['arn']},
                NetworkInterfaces=[
                    {
                    'DeviceIndex': 0,
                    'SubnetId': self.attributes['subnet_id'],
                    'PrivateIpAddress': self.attributes['private_ip_address'],
                    'AssociatePublicIpAddress': True,
                    'Groups': self.attributes['security_group_ids']
                    },
                ],
                BlockDeviceMappings=[
                    {
                        'DeviceName': self.attributes['root_dev_name'],
                        'Ebs': {
                            'VolumeSize': int(self.attributes['root_dev_size']),
                            'VolumeType': self.attributes['root_dev_type']
                        }
                    },
                    # Ephemeral devices to match kube-up behavior to get SSD attached.
                    {
                        'DeviceName': '/dev/sdc',
                        'VirtualName': 'ephemeral0'
                    },
                    {
                        'DeviceName': '/dev/sdd',
                        'VirtualName': 'ephemeral1'
                    },
                    {
                        'DeviceName': '/dev/sde',
                        'VirtualName': 'ephemeral2'
                    },
                    {
                        'DeviceName': '/dev/sdf',
                        'VirtualName': 'ephemeral3'
                    },
                ],
                EbsOptimized=self.attributes['ebs_optimized']
            )
            return response
        except Exception as e:
            logger.exception("Error running instances: %s", str(e))
            raise  # re-raise so the @retry decorator can actually retry

    def launch_new_master(self):
        """
        Launches the new master instance.
        """
        logger.info("Launching new master ...")
        # Read the base64 encoded data and decode it before using it. AWS will
        # base64 encode it again.
        with open(self.attributes['user_data_file'], 'r') as user_data_file:
            user_data = base64.b64decode(user_data_file.read())

        response = self.run_new_master(user_data)
        new_master_id = response["Instances"][0]['InstanceId']
        logger.info("Waiting for new master %s to start", new_master_id)
        new_master = self.ec2.Instance(new_master_id)

        # Each call to ec2_instance.wait_until_running below will wait for a max of 15 minutes.
        # Give enough time for the instance to start...
        counter = 0
        while counter < 2:
            try:
                new_master.wait_until_running()
                counter += 1
            except botocore.exceptions.WaiterError:
                logger.debug("Still waiting for new master to run...")

        logger.info("New master with instance id %s is up!", new_master.instance_id)

        self.add_tags(new_master)
        self.attach_eip(new_master)
        self.attach_volume(new_master)
        self.replace_route(new_master)

        return new_master

    @retry(wait_fixed=2000)
    def wait_for_termination(self):
        """
        Waits for the currently running master instance to terminate.
        """
        # check if master api server is alive and if not terminate master
        try:
            logger.info("Checking if Master API server is alive...")
            self.check_master_api_server()
            logger.info("Master API server is alive...")
        except urllib3.exceptions.HTTPError as e:
            logger.error("Got the following exception while trying to check master API server: %s", e)
            logger.info("Assuming master is bad and terminating it...")
            self.terminate_master()
            logger.info("Done terminating master")
            return
        except Exception as e:
            logger.warning("Got the following error from the Kubernetes master API server: %s. "
                           "It looks alive, so this transient error is ignored.", e)

        logger.debug("Waiting for master termination signal ...")
        self.master_instance.wait_until_terminated()
        logger.info("Master down!")

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=3, retry_on_exception=print_exception)
    def terminate_master(self):
        """
        Terminate current master instance and wait until it's done.
        """
        logger.info("Terminating master %s.", self.master_instance)
        self.client.terminate_instances(InstanceIds=[self.master_instance.instance_id])
        self.master_instance.wait_until_terminated()

    @retry(retry_on_exception=default_aws_retry, wait_exponential_multiplier=2000, stop_max_attempt_number=3)
    def stop_master(self):
        stop_master_requested = False
        master_instance_id = self.discover_master(state=[EC2InstanceState.Stopping, EC2InstanceState.Stopped])
        if master_instance_id:
            stop_master_requested = True

        if not stop_master_requested:
            master_instance_id = self.discover_master(state=["*"])
            if not master_instance_id:
                raise AXPlatformException("Cannot find master instance")
            try:
                self.client.stop_instances(InstanceIds=[master_instance_id])
            except ClientError as ce:
                if "UnsupportedOperation" in str(ce) and "StopInstances" in str(ce):
                    logger.warning("Master instance %s a spot instance, which cannot be stopped.")
                    return
                elif "IncorrectInstanceState" in str(ce):
                    # Master could be in "terminating", "terminated", or "stopped" state. It does not
                    # make sense that first 2 states could kick in, unless there is some human intervention
                    # so the code will stuck in waiting for master to go into "stopped" state, which is
                    # a good indication for checking manually
                    pass
                else:
                    raise ce
        logger.info("Waiting for master %s to get into state \"stopped\"", master_instance_id)
        while True:
            stopped_master = self.discover_master(state=[EC2InstanceState.Stopped])
            if stopped_master:
                logger.info("Master %s successfully stopped", master_instance_id)
                return
            else:
                time.sleep(5)

    @retry(retry_on_exception=default_aws_retry, wait_exponential_multiplier=2000, stop_max_attempt_number=3)
    def restart_master(self):
        started_master_id = self.discover_master(state=[EC2InstanceState.Running])
        if started_master_id:
            logger.info("Master %s is already running", started_master_id)
            return

        stopped_master_id = self.discover_master(state=[EC2InstanceState.Stopped])
        if not stopped_master_id:
            raise AXPlatformException("Cannot find a previously stopped master instance")

        # A "stopped" instance can always be started, so any other exception is propagated
        self.client.start_instances(InstanceIds=[stopped_master_id])

        logger.info("Waiting for master %s to get into state \"running\"", stopped_master_id)
        while True:
            running_master_id = self.discover_master(state=[EC2InstanceState.Running])
            if running_master_id:
                logger.info("Master %s successfully started", running_master_id)
                return
            else:
                time.sleep(5)

    def save_master_config(self, file_path):
        """
        Uploads the master attributes and user-data (in base64encoded format) into a directory
        in the s3 bucket.
        """
        with open(file_path, 'r') as user_data_file:
            user_data = user_data_file.read()
        self.cluster_info.upload_master_config_to_s3(self.attributes, user_data)

    def user_data_updated(self):
        """
        Get both old and new user data file content and compare them.
        Return True if they are different.
        """
        with open(USER_DATA_FILE_S3, "r") as f:
            old = f.read()
        with open(USER_DATA_FILE_NEW, "r") as f:
            new = f.read()
        return old != new

    def send_notification(self, code, message):
        try:
            self.event_notification_client.send_message_to_notification_center(
                code, detail={'message': "[master_manager] " + message})
        except Exception as exc:
            logger.exception("Failed to send event to notification center: %s", exc)
        return

    def run(self):
        """
        The main method for the MasterManager.
        """
        logger.info("Running the MasterManager!")
        attr_str = self.cluster_info.get_master_config(USER_DATA_FILE_S3)
        if attr_str is not None:
            self.attributes = json.loads(attr_str)
            self.attributes['user_data_file'] = USER_DATA_FILE_S3

        # Check if the master is running. Update the self.master_instance object.
        try:
            instance_id = self.discover_master()
            if instance_id is not None:
                self.master_instance = self.ec2.Instance(instance_id)
                logger.info("Master instance discovered: %s", self.master_instance.instance_id)

                # this will retry for a while and then throw an exception if master api server is unreachable
                self.check_master_api_server()

                if not self.attributes:
                    # This is needed only for first startup when cluster is created.
                    logger.debug("Populating attributes")
                    self.populate_attributes()
                    logger.debug("Saving master's config into S3")
                    self.save_master_config(USER_DATA_FILE_NEW)
                    logger.info("Master config uploaded to s3")
        except Exception as e:
            raise AXPlatformException("Failed to discover master: " + str(e))

        while True:
            if self.master_instance is not None:
                self.wait_for_termination()
                message = "Master instance with id " + \
                    self.master_instance.instance_id + " terminated. A " + \
                    "new master instance will be created. This should " + \
                    "take a few minutes"
            else:
                logger.info("Master not running")
                message = "Master instance not found" + \
                    "A new master instance will be created. This should " + \
                    "take a few minutes."

            self.send_notification(CODE_PLATFORM_ERROR, message)
            new_master = self.launch_new_master()
            self.master_instance = self.ec2.Instance(new_master.instance_id)
            logger.info("New master instance %s running", self.master_instance.instance_id)
            self.send_notification(CODE_PLATFORM_CRITICAL, "New master " + \
                                   "instance with id {} started".format(
                                       self.master_instance.instance_id))
            logger.info("Wait for {} minutes before running checks...".format(WAIT_TIME_POST_RESTART_MIN))
            time.sleep(WAIT_TIME_POST_RESTART_MIN * const.SECONDS_PER_MINUTE)
            logger.info("Done waiting. Now back to checks")

    @retry_unless()
    def check_master_api_server(self):
        c = KubernetesApiClient()
        c.api.read_namespaced_service("default", "kubernetes")

    def upgrade(self):
        """
        Entry point for master upgrade.
        Support upgrade of:
            - Kubernetes versions;
            - AMI image;
            - Selected list of kube_env variables.
        """
        logger.info("Starting master upgrade!")
        ami_name = os.getenv("AX_AWS_IMAGE_NAME")
        assert ami_name, "Failed to detect AMI name from environment"
        ami_id = AMI(aws_region=self.region, aws_profile=self.profile).get_ami_id_from_name(ami_name=ami_name)
        logger.info("Using ami %s for new master", ami_id)

        s3_data = self.cluster_info.get_master_config(USER_DATA_FILE_S3)
        if s3_data is None:
            attr = None
        else:
            attr = json.loads(s3_data)
        instance_id = self.discover_master()
        terminating = False
        launching = False
        if instance_id is None:
            # This is possible if a previous upgrade failed after termination but before the new master
            # started; simply restart the master in this case.
            # It can also happen when the master crashed in the first place and an upgrade was then
            # started; we use the old config to start the master and rerun the upgrade.
            logger.info("No running master. S3 attr %s.", USER_DATA_FILE_S3)
            assert attr is not None, "No master instance and no master config."
            self.attributes = attr
            self.attributes['user_data_file'] = USER_DATA_FILE_S3

            self.ensure_master_tags()
            self.save_master_config(USER_DATA_FILE_S3)
            launching = True
        else:
            self.master_instance = self.ec2.Instance(instance_id)
            logger.info("Running master %s.", instance_id)
            self.aws_image = ami_id
            self.instance_profile = AXClusterInstanceProfile(self.cluster_name_id, region_name=self.region,
                                                             aws_profile=self.profile).get_master_arn()
            self.populate_attributes()
            master_tag_updated = self.ensure_master_tags()
            # TODO: Possible race here.
            # If upgrade is interrupted after config saving but before master termination,
            # Next upgrade attempt would assume master is already upgraded.
            # Manually hack to terminate instance is needed then.
            if attr != self.attributes or self.user_data_updated() or master_tag_updated:
                self.save_master_config(USER_DATA_FILE_NEW)
                terminating = True
                launching = True

        if terminating:
            self.terminate_master()
            logger.info("Done terminating %s", instance_id)
        if launching:
            logger.info("Done launching %s", self.launch_new_master())

    def ensure_master_tags(self):
        """
        During upgrade, we need to ensure master has AXClusterNameID, AXCustomerID, and AXTier tags (#23)
        :return: True if we updated master tags
        """
        for tag in self.attributes['master_tags']:
            if tag["Key"] == "AXTier":
                # Master has updated tags
                return False

        self.attributes['master_tags'] += [
            {
                "Key": "AXCustomerID",
                "Value": AXCustomerId().get_customer_id()
            },
            {
                "Key": "AXTier",
                "Value": "master"
            },
            {
                "Key": "AXClusterNameID",
                "Value": self.cluster_name_id
            },
        ]
        return True
Example #5
class Gateway(object):
    """Repo Controller"""
    BASE_DIR = '/ax/data/repos'
    BRANCH_CACHE_TTL = 5 * 60  # 5 minutes TTL as we expect we won't finish upgrade within 5 minutes
    NAMESPACE = 'gateway'

    CLUSTER_NAME_ID = os.environ.get('AX_CLUSTER')
    CUSTOMER_ID = os.environ.get('AX_CUSTOMER_ID')
    S3_BUCKET_NAME = 'applatix-cluster-{account}-{seq}'.format(account=CUSTOMER_ID, seq=0)
    s3_bucket = boto3.resource('s3').Bucket(S3_BUCKET_NAME)

    def __init__(self):
        self.axdb_client = AxdbClient()
        self.axops_client = AxopsClient()
        self.axsys_client = AxsysClient()
        self.redis_client = RedisClient('redis', db=DB_REPORTING)
        self.event_notification_client = EventNotificationClient(FACILITY_GATEWAY)
        self.scm_clients = {
            ScmVendors.BITBUCKET: BitBucketClient(),
            ScmVendors.GITHUB: GitHubClient(),
            ScmVendors.GITLAB: GitLabClient()
        }
        self.repo_manager = RepoManager(DEFAULT_CONCURRENCY, DEFAULT_INTERVAL)
        self.event_trigger = EventTrigger()

    def get_repos(self, scm_type, url, username, password):
        """Get all repos owned by the user."""
        if scm_type in {ScmVendors.BITBUCKET, ScmVendors.GITHUB, ScmVendors.GITLAB}:
            try:
                repos = self.scm_clients[scm_type].get_repos(username, password)
            except Exception as e:
                logger.warning('Unable to connect to %s: %s', scm_type, e)
                detail = {
                    'type': scm_type,
                    'username': username,
                    'error': str(e.detail)
                }
                self.event_notification_client.send_message_to_notification_center(CODE_CONFIGURATION_SCM_CONNECTION_ERROR,
                                                                                   detail=detail)
                raise AXApiInvalidParam('Cannot connect to %s server' % scm_type)
            else:
                return repos
        elif scm_type == ScmVendors.GIT:
            _, vendor, repo_owner, repo_name = Gateway.parse_repo(url)
            path = '/tmp/{}/{}/{}'.format(vendor, repo_owner, repo_name)
            if os.path.isfile(path):
                os.remove(path)
            if os.path.isdir(path):
                shutil.rmtree(path)
            os.makedirs(path)
            client = GitClient(path=path, repo=url, username=username, password=password)
            try:
                client.list_remote()
            except Exception as e:
                logger.warning('Unable to connect to git server (%s): %s', url, e)
                detail = {
                    'type': scm_type,
                    'url': url,
                    'username': username,
                    'error': str(e)
                }
                self.event_notification_client.send_message_to_notification_center(CODE_CONFIGURATION_SCM_CONNECTION_ERROR,
                                                                                   detail=detail)
                raise AXApiInvalidParam('Cannot connect to git server')
            else:
                return {url: url}
        elif scm_type == ScmVendors.CODECOMMIT:
            repos = {}
            region = 'us-east-1'
            default_url_format = 'https://git-codecommit.{}.amazonaws.com/v1/repos/{}'
            client = boto3.client('codecommit', aws_access_key_id=username, aws_secret_access_key=password,
                                  region_name=region)
            try:
                response = client.list_repositories().get('repositories', [])
                for r in response:
                    repo_url = default_url_format.format(region, r['repositoryName'])
                    repos[repo_url] = repo_url
            except Exception as exc:
                detail = {
                    'type': scm_type,
                    'region': region,
                    'url': default_url_format.format(region, ''),
                    'username': username,
                    'error': 'Cannot connect to CodeCommit: ' + str(exc)
                }
                self.event_notification_client.send_message_to_notification_center(CODE_CONFIGURATION_SCM_CONNECTION_ERROR,
                                                                                   detail=detail)
                raise AXApiInvalidParam('Cannot connect to CodeCommit: %s' % exc)
            else:
                return repos
        else:
            return {}

    @staticmethod
    def parse_repo(repo):
        """Parse repo url into 4-tuple (protocol, vendor, repo_owner, repo_name).

        :param repo:
        :return:
        """
        parsed_url = urlparse(repo)
        protocol, vendor = parsed_url.scheme, parsed_url.hostname
        m = re.match(r'/([a-zA-Z0-9\-]+)/([a-zA-Z0-9_.\-/]+)', parsed_url.path)
        if not m:
            raise AXScmException('Illegal repo URL', detail='Illegal repo URL ({})'.format(repo))
        repo_owner, repo_name = m.groups()
        return protocol, vendor, repo_owner, repo_name
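
To illustrate, the regex above splits a typical HTTPS clone URL as follows. This is a standalone mirror of parse_repo (constructing a Gateway needs live AWS credentials); the Python 3 urllib import is an assumption, since the original imports are not shown.

import re
from urllib.parse import urlparse

def parse_repo(repo):
    # Standalone mirror of Gateway.parse_repo, for demonstration only.
    parsed_url = urlparse(repo)
    m = re.match(r'/([a-zA-Z0-9\-]+)/([a-zA-Z0-9_.\-/]+)', parsed_url.path)
    if not m:
        raise ValueError('Illegal repo URL ({})'.format(repo))
    repo_owner, repo_name = m.groups()
    return parsed_url.scheme, parsed_url.hostname, repo_owner, repo_name

print(parse_repo('https://github.com/argoproj/argo'))
# -> ('https', 'github.com', 'argoproj', 'argo')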

    def has_webhook(self, repo):
        """Test if there is any repo which uses webhook.

        :param repo:
        :return:
        """
        tools = self.axops_client.get_tools(category='scm')
        for i in range(len(tools)):
            use_webhook = tools[i].get('use_webhook', False)
            repos = set(tools[i].get('repos', []))
            repos -= {repo}
            if use_webhook and repos:
                return True
        return False

    def get_webhook(self, vendor, repo):
        """Get webhook

        :param vendor:
        :param repo:
        :returns:
        """
        logger.info('Retrieving webhook (repo: %s) ...', repo)
        return self.scm_clients[vendor].get_webhook(repo)

    def create_webhook(self, vendor, repo):
        """Create webhook

        :param vendor:
        :param repo:
        :returns:
        """

        @retry(wait_fixed=5000, stop_max_delay=20 * 60 * 1000)
        def _verify_elb(hostname):
            try:
                logger.info('Verifying ELB (%s) ...', hostname)
                ip = socket.gethostbyname(hostname)
                logger.info('Successfully resolved ELB (%s) to IP (%s)', hostname, ip)
            except Exception as e:
                logger.error('ELB not ready: %s', str(e))
                raise AXApiInternalError('ELB not ready', str(e))

        ip_range = self.scm_clients[vendor].get_webhook_whitelist()

        # Create ELB
        payload = {'ip_range': ip_range, 'external_port': 8443, 'internal_port': 8087}
        try:
            logger.info('Creating ELB for webhook ...')
            result = self.axsys_client.create_webhook(**payload)
        except Exception as e:
            logger.error('Failed to create ELB for webhook: %s', str(e))
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_ELB_CREATION_FAILURE,
                                                                               detail=payload)
            raise AXApiInternalError('Failed to create ELB for webhook', str(e))
        else:
            logger.info('Successfully created ELB for webhook')

        # Verify ELB
        hostname = result['hostname']
        try:
            _verify_elb(hostname)
        except Exception as e:
            logger.error('Timed out on waiting for ELB to be available: %s', str(e))
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_ELB_VERIFICATION_TIMEOUT,
                                                                               detail={'hostname': hostname})
            raise AXApiInternalError('Timed out on waiting for ELB to be available: %s' % str(e))

        # Create webhook
        try:
            logger.info('Creating webhook (repo: %s) ...', repo)
            self.scm_clients[vendor].create_webhook(repo)
        except AXApiAuthFailed as e:
            logger.error('Invalid credential supplied')
            detail = {
                'repo': repo,
                'error': 'Invalid credential supplied: ' + str(e)
            }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_WEBHOOK_CREATION_FAILURE,
                                                                               detail=detail)
            raise AXApiInvalidParam('User authentication failed', detail=str(e))
        except AXApiForbiddenReq as e:
            logger.error('Supplied credential is valid but has insufficient permission')
            detail = {
                'repo': repo,
                'error': 'Supplied credential is valid but has insufficient permission: ' + str(e)
            }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_WEBHOOK_CREATION_FAILURE,
                                                                               detail=detail)
            raise AXApiInvalidParam('User has insufficient permission', detail=str(e))
        except Exception as e:
            logger.error('Failed to configure webhook: %s', e)
            detail = {
                'repo': repo,
                'error': 'Failed to configure webhook: ' + str(e)
            }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_WEBHOOK_CREATION_FAILURE,
                                                                               detail=detail)
            raise AXApiInternalError('Failed to configure webhook', str(e))
        else:
            logger.info('Successfully created webhook (repo: %s)', repo)
            return {}

    def delete_webhook(self, vendor, repo):
        """Delete webhook

        :param vendor:
        :param repo:
        :returns:
        """
        # Delete webhook
        try:
            logger.info('Deleting webhook (repo: %s) ...', repo)
            self.scm_clients[vendor].delete_webhook(repo)
        except AXApiAuthFailed as e:
            logger.error('Invalid credential supplied')
            detail = {
                'repo': repo,
                'error': 'Invalid credential supplied: ' + str(e)
            }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_WEBHOOK_DELETION_FAILURE,
                                                                               detail=detail)
            raise AXApiInvalidParam('User authentication failed', detail=str(e))
        except AXApiForbiddenReq as e:
            logger.error('Supplied credential is valid but has insufficient permission')
            detail = {
                'repo': repo,
                'error': 'Supplied credential is valid but has insufficient permission: ' + str(e)
            }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_WEBHOOK_DELETION_FAILURE,
                                                                               detail=detail)
            raise AXApiInvalidParam('User has insufficient permission', detail=str(e))
        except Exception as e:
            logger.error('Failed to delete webhook: %s', e)
            detail = {
                'repo': repo,
                'error': 'Failed to delete webhook: ' + str(e)
            }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_WEBHOOK_DELETION_FAILURE,
                                                                               detail=detail)
            raise AXApiInternalError('Failed to delete webhook', str(e))
        else:
            logger.info('Successfully deleted webhook (repo: %s)', repo)

        # Delete ELB
        try:
            if not self.has_webhook(repo):
                logger.info('Deleting ELB for webhook ...')
                self.axsys_client.delete_webhook()
        except Exception as e:
            logger.error('Failed to delete ELB for webhook: %s', str(e))
            detail = {'repo': repo,
                      'error': 'Failed to delete ELB for webhook: ' + str(e)
                      }
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_ELB_DELETION_FAILURE,
                                                                               detail=detail)
            raise AXApiInternalError('Failed to delete ELB for webhook', str(e))
        else:
            logger.info('Successfully deleted ELB for webhook')
            return {}

    def purge_branches(self, repo, branch=None):
        """Purge branch heads.

        :param repo:
        :param branch:
        :return:
        """
        if not repo:
            raise AXApiInvalidParam('Missing required parameter', 'Missing required parameter (repo)')
        logger.info('Purging branch heads (repo: %s, branch: %s) ...', repo, branch)

        try:
            if not branch:
                self.axdb_client.purge_branch_heads(repo)
            else:
                self.axdb_client.purge_branch_head(repo, branch)
        except Exception as e:
            message = 'Unable to purge branch heads'
            detail = 'Unable to purge branch heads (repo: {}, branch: {}): {}'.format(repo, branch, str(e))
            logger.error(detail)
            raise AXApiInternalError(message, detail)
        else:
            logger.info('Successfully purged branch heads')

    def get_branches(self, repo=None, branch=None, order_by=None, limit=None):
        """Get branches.

        :param repo:
        :param branch:
        :param order_by:
        :param limit:
        :return:
        """

        def _get_branches(workspace):
            """Retrieve list of remote branches in the workspace.

            :param workspace:
            :return: a list of dictionaries.
            """
            try:
                key = '{}:{}'.format(Gateway.NAMESPACE, workspace)
                if self.redis_client.exists(key):
                    logger.info('Loading cache (workspace: %s) ...', workspace)
                    results = self.redis_client.get(key, decoder=json.loads)
                    return results
                else:
                    logger.info('Scanning workspace (%s) ...', workspace)
                    git_client = GitClient(path=workspace, read_only=True)
                    repo = git_client.get_remote()
                    branches = git_client.get_remote_heads()
                    results = []
                    for i in range(len(branches)):
                        results.append({
                            'repo': repo,
                            'name': branches[i]['reference'],
                            'revision': branches[i]['commit'],
                            'commit_date': branches[i]['commit_date']
                        })
                    logger.info('Saving cache (workspace: %s) ...', workspace)
                    self.redis_client.set(key, results, expire=Gateway.BRANCH_CACHE_TTL, encoder=json.dumps)
                    return results
            except Exception as e:
                logger.warning('Failed to scan workspace (%s): %s', workspace, e)
                return []

        logger.info('Retrieving branches (repo: %s, branch: %s) ...', repo, branch)
        if repo:
            repo = unquote(repo)
            _, vendor, repo_owner, repo_name = self.parse_repo(repo)
            workspaces = ['{}/{}/{}/{}'.format(Gateway.BASE_DIR, vendor, repo_owner, repo_name)]
        else:
            dirs = [dir_name[0] for dir_name in os.walk(Gateway.BASE_DIR) if dir_name[0].endswith('/.git')]
            workspaces = list(map(lambda v: v[:-5], dirs))

        branches = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = []
            for i in range(len(workspaces)):
                futures.append(executor.submit(_get_branches, workspaces[i]))
            for future in as_completed(futures):
                try:
                    data = future.result()
                except Exception as e:
                    logger.warning('Unexpected exception occurred during processing: %s', e)
                else:
                    branches.extend(data)
        if branch:
            pattern = '.*{}.*'.format(branch.replace('*', '.*'))
            branches = [b for b in branches if re.match(pattern, b['name'])]
        if order_by == 'commit_date':
            branches = sorted(branches, key=lambda v: v['commit_date'])
        elif order_by == '-commit_date':
            branches = sorted(branches, key=lambda v: v['commit_date'], reverse=True)
        elif order_by == '-native':
            branches = sorted(branches, key=lambda v: (v['repo'], v['name']), reverse=True)
        else:
            branches = sorted(branches, key=lambda v: (v['repo'], v['name']))
        if limit:
            branches = branches[:limit]
        logger.info('Successfully retrieved %s branches', len(branches))
        return branches

    @staticmethod
    def _get_commits(workspace, branch=None, since=None, until=None, commit=None, author=None, committer=None,
                     description=None, limit=None):
        """Search for commits in a workspace."""
        try:
            logger.info('Scanning workspace (%s) for commits ...', workspace)
            git_client = GitClient(path=workspace, read_only=True)
            if commit and commit.startswith('~'):
                commit = commit[1:]
            if author and author.startswith('~'):
                author = author[1:]
            if committer and committer.startswith('~'):
                committer = committer[1:]
            if description and description.startswith('~'):
                description = description[1:]
            return git_client.get_commits(branch=branch, commit=commit, since=since, until=until, author=author,
                                          committer=committer, description=description, limit=limit)
        except Exception as e:
            logger.warning('Failed to scan workspace (%s): %s', workspace, e)

    @staticmethod
    def _get_commit(workspace, commit):
        """Get a commit from a workspace."""
        try:
            logger.info('Scanning workspace (%s) for commit (%s) ...', workspace, commit)
            git_client = GitClient(path=workspace, read_only=True)
            return git_client.get_commit(commit)
        except Exception as e:
            logger.warning('Failed to scan workspace (%s): %s', workspace, e)

    @staticmethod
    def _parse_repo_branch(repo, branch, repo_branch):
        """Parse repo / branch / repo_branch."""
        if repo:
            try:
                repo = unquote(repo)
                _, vendor, repo_owner, repo_name = Gateway.parse_repo(repo)
            except Exception as e:
                msg = 'Unable to parse repo: %s' % str(e)
                logger.error(msg)
                raise AXApiInvalidParam('Unable to parse repo', msg)
            else:
                dir = '{}/{}/{}/{}'.format(Gateway.BASE_DIR, vendor, repo_owner, repo_name)
                workspaces = {dir: [branch] if branch else []}
        elif repo_branch:
            try:
                repo_branch = json.loads(repo_branch)
                workspaces = {}
                for repo in repo_branch.keys():
                    repo = unquote(repo)
                    _, vendor, repo_owner, repo_name = Gateway.parse_repo(repo)
                    dir = '{}/{}/{}/{}'.format(Gateway.BASE_DIR, vendor, repo_owner, repo_name)
                    if dir not in workspaces:
                        workspaces[dir] = set()
                    for branch in repo_branch[repo]:
                        workspaces[dir].add(branch)
            except Exception as e:
                msg = 'Unable to parse repo_branch: %s' % str(e)
                logger.error(msg)
                raise AXApiInvalidParam('Unable to parse repo_branch', msg)
        else:
            dirs = [dir[0] for dir in os.walk(Gateway.BASE_DIR) if dir[0].endswith('/.git')]
            workspaces = list(map(lambda v: v[:-5], dirs))
            workspaces = dict([(k, [branch] if branch else []) for k in workspaces])
        return workspaces
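
A sketch of the three input shapes this helper accepts, with hypothetical values; the resulting mapping keys are workspace paths under BASE_DIR:

# Hypothetical inputs, following the parsing logic above.

# 1. repo + branch: one workspace, one branch.
# _parse_repo_branch('https://github.com/argoproj/argo', 'master', None)
# -> {'/ax/data/repos/github.com/argoproj/argo': ['master']}

# 2. repo_branch: a JSON object mapping repo URLs to branch lists (values become sets).
# _parse_repo_branch(None, None, '{"https://github.com/argoproj/argo": ["master", "release-1.0"]}')
# -> {'/ax/data/repos/github.com/argoproj/argo': {'master', 'release-1.0'}}

# 3. Neither: every workspace found under BASE_DIR, each mapped to [branch] or [].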

    @staticmethod
    def _put_file(repo, branch, path):
        """Put a file in s3.

        :param repo:
        :param branch:
        :param path:
        :return:
        """
        _, vendor, repo_owner, repo_name = Gateway.parse_repo(repo)
        workspace = '{}/{}/{}/{}'.format(Gateway.BASE_DIR, vendor, repo_owner, repo_name)
        if not os.path.isdir(workspace):
            raise AXApiInvalidParam('Invalid repository', 'Invalid repository ({})'.format(repo))
        try:
            logger.info('Extracting file content from repository (repo: %s, branch: %s, path: %s) ...',
                        repo, branch, path)
            git_client = GitClient(path=workspace, read_only=True)
            files = git_client.get_files(branch=branch, subdir=path, binary_mode=True)
        except Exception as e:
            message = 'Failed to extract file content'
            detail = '{}: {}'.format(message, str(e))
            logger.error(detail)
            raise AXApiInternalError(message, detail)
        else:
            if len(files) == 0:
                raise AXApiInvalidParam('Unable to locate file with given information')
            file_content = files[0]['content']
            logger.info('Successfully extracted file content')

        try:
            # Cluster name id always has the form <cluster_name>-<36_bytes_long_cluster_id>
            cluster_name, cluster_id = Gateway.CLUSTER_NAME_ID[:-37], Gateway.CLUSTER_NAME_ID[-36:]
            key = '{cluster_name}/{cluster_id}/{vendor}/{repo_owner}/{repo_name}/{branch}/{path}'.format(
                cluster_name=cluster_name, cluster_id=cluster_id, vendor=vendor,
                repo_owner=repo_owner, repo_name=repo_name, branch=branch, path=path)
            logger.info('Uploading file content to s3 (bucket: %s, key: %s) ...', Gateway.S3_BUCKET_NAME, key)
            response = Gateway.s3_bucket.Object(key).put(Body=file_content)
            etag = response.get('ETag')
            if etag:
                etag = json.loads(etag)
        except Exception as e:
            message = 'Failed to upload file content'
            detail = '{}: {}'.format(message, str(e))
            logger.error(detail)
            raise AXApiInternalError(message, detail)
        else:
            logger.info('Successfully uploaded file content')
            return {'bucket': Gateway.S3_BUCKET_NAME, 'key': key, 'etag': etag}
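
The cluster-name/cluster-id split works because the id is a fixed-width 36-character UUID preceded by a hyphen. A quick check with a hypothetical value:

# Hypothetical cluster name id of the form <cluster_name>-<36-char-uuid>.
name_id = 'demo-3a7f0c2e-1b4d-4f6a-9e8b-2c5d7a9f0e1b'
cluster_name, cluster_id = name_id[:-37], name_id[-36:]
assert cluster_name == 'demo'
assert cluster_id == '3a7f0c2e-1b4d-4f6a-9e8b-2c5d7a9f0e1b'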

    @staticmethod
    def _delete_file(repo, branch, path):
        """Delete a file from s3.

        :param repo:
        :param branch:
        :param path:
        :return:
        """
        _, vendor, repo_owner, repo_name = Gateway.parse_repo(repo)
        try:
            cluster_name, cluster_id = Gateway.CLUSTER_NAME_ID[:-37], Gateway.CLUSTER_NAME_ID[-36:]
            key = '{cluster_name}/{cluster_id}/{vendor}/{repo_owner}/{repo_name}/{branch}/{path}'.format(
                cluster_name=cluster_name, cluster_id=cluster_id, vendor=vendor,
                repo_owner=repo_owner, repo_name=repo_name, branch=branch, path=path)
            logger.info('Deleting file from s3 (bucket: %s, key: %s) ...', Gateway.S3_BUCKET_NAME, key)
            Gateway.s3_bucket.Object(key).delete()
        except Exception as e:
            message = 'Failed to delete file'
            detail = '{}: {}'.format(message, str(e))
            logger.error(detail)
            raise AXApiInternalError(message, detail)
        else:
            logger.info('Successfully deleted file')
            return {'bucket': Gateway.S3_BUCKET_NAME, 'key': key}

    @staticmethod
    def init_jira_client(axops_client, url=None, username=None, password=None):
        """Initialize an Jira client"""

        def get_jira_configuration():
            js = axops_client.get_tools(category='issue_management', type='jira')
            if js:
                return {'url': js[0]['url'],
                        'username': js[0]['username'],
                        'password': js[0]['password']
                        }
            else:
                return dict()

        if url is None or username is None or password is None:
            conf = get_jira_configuration()
            if not conf:
                raise AXApiInvalidParam('No JIRA configured')
            else:
                url, username, password = conf['url'], conf['username'], conf['password']
        return JiraClient(url, username, password)

    # TODO: verify whether this function is still needed
    def check_github_whitelist(self):
        if not self.is_github_webhook_enabled():
            logger.info('No GitHub webhook configured')
            return
        configured = self.get_from_cache()
        logger.info('The configured GitHub webhook whitelist is %s', configured)
        advertised = self.scm_clients[ScmVendors.GITHUB].get_webhook_whitelist()
        logger.info('The advertised GitHub webhook whitelist is %s', advertised)
        if set(configured) == set(advertised):
            logger.info('No update needed')
        else:
            # Create ELB
            payload = {'ip_range': advertised, 'external_port': 8443, 'internal_port': 8087}
            try:
                logger.info('Creating ELB for webhook ...')
                self.axsys_client.create_webhook(**payload)
            except Exception as exc:
                logger.error('Failed to create ELB for webhook: %s', str(exc))
                self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_ELB_CREATION_FAILURE,
                                                                                   detail=payload)
            else:
                # Update cache
                self.write_to_cache(advertised)
                logger.info('Successfully updated ELB for webhook')

    def is_github_webhook_enabled(self):
        """ Check whether the webhook is configured or not"""
        github_data = self.axops_client.get_tools(type='github')
        use_webhook = [each for each in github_data if each['use_webhook']]
        return bool(use_webhook)

    @staticmethod
    def write_to_cache(ip_range):
        """ Store the webhook whitelist info"""
        cache_file = '/tmp/github_webhook_whitelist'
        with open(cache_file, 'w+') as f:
            f.write(json.dumps(ip_range))

    def get_from_cache(self):
        """ Get cached webhook whitelist info, otherwise get from axmon"""
        cache_file = '/tmp/github_webhook_whitelist'
        ip_range = list()
        if os.path.exists(cache_file):
            with open(cache_file, 'r+') as f:
                data = f.readlines()
                ip_range = json.loads(data[0])
        else:
            logger.debug('No cache file')
            try:
                data = self.axsys_client.get_webhook()
            except Exception as exc:
                logger.warning(exc)
            else:
                logger.info('Write whitelist info to cache file')
                ip_range = data['ip_ranges']
                self.write_to_cache(ip_range)
        return ip_range
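
For reference, a minimal sketch of how _put_file above derives its S3 key from the cluster name-id; every value below is made-up for illustration:

# Hypothetical values for illustration only.
CLUSTER_NAME_ID = 'demo-' + '0' * 36  # <cluster_name>-<36_chars_long_cluster_id>

cluster_name, cluster_id = CLUSTER_NAME_ID[:-37], CLUSTER_NAME_ID[-36:]
key = '{}/{}/{}/{}/{}/{}/{}'.format(
    cluster_name, cluster_id, 'github', 'argoproj', 'argo', 'master', '.argo/wf.yaml')
# key == 'demo/000...000/github/argoproj/argo/master/.argo/wf.yaml'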
Example #6
0
 def __init__(self, axops_host=None):
     self._schedule_lock = threading.Lock()
     self.axops_client = AxopsClient(host=axops_host)
     self.scheduler = BackgroundScheduler()
     self.event_notification_client = EventNotificationClient(
         FACILITY_AX_SCHEDULER)
Example #7
0
class JobScheduler(object):
    def __init__(self, axops_host=None):
        self._schedule_lock = threading.Lock()
        self.axops_client = AxopsClient(host=axops_host)
        self.scheduler = BackgroundScheduler()
        self.event_notification_client = EventNotificationClient(
            FACILITY_AX_SCHEDULER)

    def init(self):
        """
        Initialize the job scheduler and check access to AxOps.
        """
        counter = 0
        while counter < 20:
            if self.axops_client.ping():
                self.refresh_scheduler()
                return
            else:
                counter += 1
                logger.info("JobScheduler cannot ping AxOps. Count: %s",
                            counter)
                time.sleep(10)
        logger.error(
            "[Init] scheduler failed to ping AxOps after 20 tries. Exit.")
        sys.exit(1)

    def refresh_scheduler(self):
        """
        Refresh the job scheduler.

        The major functionality of this service. Read all the cron policies from AxOps, then
        load the schedules into the job scheduler.
        """

        if self._schedule_lock.acquire(
                timeout=2):  # Try to acquire lock for 2 seconds
            try:
                scheduler = BackgroundScheduler()
                logger.info("Start refreshing the scheduler.")

                for policy in self.axops_client.get_policy(enabled=True):
                    self.add_policy(policy, scheduler)

                # Scheduler swap
                self.stop_scheduler()
                self.scheduler = scheduler
                self.scheduler.start()
                logger.info(
                    "Successfully finish refreshing the scheduler. \n%s",
                    AxPrettyPrinter().pformat(self.get_schedules()))
                return {}
            finally:
                self._schedule_lock.release()
        else:
            logger.info("Some other thread is refreshing the scheduler. Instant return.")
            return {'Details': 'Instant return'}

    def add_policy(self, policy, scheduler):
        """
        Add a schedule into scheduler based on policy.

        Ignore exceptions (for now).
        """
        try:
            policy_json = policy_schema(policy)
            policy_id = policy_json['id']
            event_list = policy_json['when']
            logger.info("Processing policy, %s", policy_id)
            for event in event_list:
                if event.get('event', None) != 'on_cron':
                    continue
                event_json = schedule_schema(event)
                cron_str = event_json['schedule'].strip().split(
                    ' ')  # Parse the cron string
                assert len(cron_str) == 5, "Invalid cron schedule format"
                logger.info("Adding cron event, \n %s",
                            AxPrettyPrinter().pformat(event_json))
                scheduler.add_job(
                    self.create_service,
                    'cron',  # Add cron job into scheduler
                    id='{}-{}'.format(policy_id, cron_str),
                    args=[policy_json],
                    minute=cron_str[0],
                    hour=cron_str[1],
                    day=cron_str[2],
                    month=cron_str[3],
                    day_of_week=cron_str[4],
                    timezone=event_json['timezone'])
        except MultipleInvalid as e:
            logger.exception("Invalid cron policy format, \n%s. Details: %s",
                             AxPrettyPrinter().pformat(policy), str(e))
            try:
                if 'when' in policy:
                    policy['when'] = json.dumps(policy['when'])
                self.event_notification_client.send_message_to_notification_center(
                    CODE_JOB_SCHEDULER_INVALID_POLICY_DEFINITION,
                    detail=policy)
            except Exception:
                logger.exception(
                    "Failed to send out alert to notification center.")
        except AssertionError as e:
            logger.exception(
                "Invalid cron policy format, \n%s, cron string. Details: %s",
                AxPrettyPrinter().pformat(policy), str(e))
            try:
                if 'when' in policy:
                    policy['when'] = json.dumps(policy['when'])
                self.event_notification_client.send_message_to_notification_center(
                    CODE_JOB_SCHEDULER_INVALID_CRON_EXPRESSION, detail=policy)
            except Exception:
                logger.exception(
                    "Failed to send out alert to notification center.")
        except Exception as e:
            logger.exception(
                "Failed to add event, \n%s into scheduler. Details: %s",
                AxPrettyPrinter().pformat(policy), str(e))
            try:
                if 'when' in policy:
                    policy['when'] = json.dumps(policy['when'])
                self.event_notification_client.send_message_to_notification_center(
                    CODE_JOB_SCHEDULER_CANNOT_ADD_POLICY, detail=policy)
            except Exception:
                logger.exception(
                    "Failed to send out alert to notification center.")

    @staticmethod
    def is_matched(target_branches, branch_name):
        """
        Check whether any regex in target_branches matches branch_name.
        """
        is_matched = False
        for branch in target_branches:
            try:
                if re.compile(branch).match(branch_name):
                    is_matched = True
                    break
            except Exception as e:
                logger.exception("Failed to compare using regex. %s", str(e))
        return is_matched

    def create_service(self, policy):
        """
        Create job based on the policy.

        The payload is tailored for the AxOps POST /v1/services. This might get improved in the future.
        """
        logger.info(
            "Start triggering job based on cron schedule. Policy info: \n%s",
            AxPrettyPrinter().pformat(policy))

        service_template = self.axops_client.get_templates(
            policy['repo'], policy['branch'], name=policy['template'])[0]
        commit_res = self.axops_client.get_commit_info(repo=policy['repo'],
                                                       branch=policy['branch'],
                                                       limit=1)
        if not commit_res or len(commit_res) != 1:
            logger.error(
                "Error retrieving latest commit info for cron job, commit_info: %s. Return",
                commit_res)
            return
        commit_json = commit_schema(commit_res[0])
        notification_info = policy['notifications']

        commit_info = {
            'revision': commit_json['revision'],
            'repo': commit_json['repo'],
            'branch': commit_json['branch'],
            'author': commit_json['author'],
            'committer': commit_json['committer'],
            'description': commit_json['description'],
            'date': commit_json['date']
        }

        parameters = copy.deepcopy(policy['arguments'])
        parameters['session.commit'] = commit_json['revision']
        parameters['session.branch'] = commit_json['branch']
        parameters['session.repo'] = commit_json['repo']

        service = {
            'template_id': service_template['id'],
            'arguments': parameters,
            'policy_id': policy['id'],
            'commit': commit_info,
        }

        if notification_info:
            service['notifications'] = notification_info

        logger.info("Creating new service with the following payload ...\n%s",
                    AxPrettyPrinter().pformat(service))
        service = self.axops_client.create_service(service)
        logger.info('Successfully created service (id: %s)', service['id'])

    def get_schedules(self):
        """
        Get the scheduled jobs in the current scheduler.
        :return: dict of scheduled jobs, keyed by job id.
        """
        result = dict()
        if self.scheduler:
            for job in self.scheduler.get_jobs():
                result[job.id] = str(job)
        return result

    def stop_scheduler(self, wait=False):
        """
        Stop the current scheduler.
        :param wait: whether to wait for the current running job.
        :return:
        """
        if self.scheduler.running:
            self.scheduler.shutdown(wait=wait)
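
A minimal usage sketch for the scheduler above; the host name is hypothetical, and AxOps must be reachable for init() to succeed:

scheduler = JobScheduler(axops_host='axops.axsys')  # hypothetical host
scheduler.init()                     # pings AxOps, then loads all enabled cron policies
print(scheduler.get_schedules())     # dict of job id -> job description
scheduler.stop_scheduler(wait=True)  # shut down, waiting for running jobs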
Example #8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2015-2017 Applatix, Inc.  All rights reserved.
#
from ax.devops.kafka.kafka_client import EventNotificationClient
from ax.notification_center import FACILITY_GATEWAY

event_notification_client = EventNotificationClient(FACILITY_GATEWAY)
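
A hedged sketch of how such a module-level client is typically used in these examples; the failing operation and the detail payload are invented:

from ax.notification_center import CODE_JOB_CI_ELB_CREATION_FAILURE

try:
    create_webhook_elb()  # hypothetical operation that may fail
except Exception as exc:
    # Fire-and-forget alert to the notification center, mirroring the gateway usage above
    event_notification_client.send_message_to_notification_center(
        CODE_JOB_CI_ELB_CREATION_FAILURE, detail={'error': str(exc)})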
Example #9
0
class BaseScmRestClient(metaclass=abc.ABCMeta):
    """Base REST API wrapper for SCM."""

    event_notification_client = EventNotificationClient(
        notification_center.FACILITY_GATEWAY)

    WEBHOOK_TITLE = 'AX CI - {}'.format(os.environ.get('AX_CLUSTER'))
    WEBHOOK_JOB_URL = 'https://{}/app/jobs/job-details'.format(
        os.environ.get('AXOPS_EXT_DNS'))

    EXCEPTION_MAPPING = {
        400: exceptions.AXApiInvalidParam,
        401: exceptions.AXApiAuthFailed,
        403: exceptions.AXApiForbiddenReq,
        404: exceptions.AXApiResourceNotFound,
        405: exceptions.AXApiResourceNotFound
    }

    STATUS_MAPPING = {}

    def __init__(self):
        """Initializer."""
        self.axops_client = AxopsClient()
        self.axsys_client = AxsysClient()
        self.repos = {}
        self.urls = {}

    def construct_webhook_url(self):
        """Construct the URL of webhook"""
        try:
            payload = self.axsys_client.get_webhook()
            dnsname = payload['hostname']
            port = payload['port_spec'][0]['port']
        except Exception as e:
            message = 'Unable to extract dnsname or port of webhook'
            detail = 'Unable to extract dnsname or port of webhook: {}'.format(
                str(e))
            logger.error(detail)
            raise exceptions.AXApiInternalError(message, detail)
        else:
            return 'https://{}:{}/v1/webhooks/scm'.format(dnsname, port)

    @abc.abstractmethod
    def get_repos(self, username, password):
        """Get repos that an account can see.

        :param username:
        :param password:
        :return:
        """

    @abc.abstractmethod
    def get_commit(self, repo, commit):
        """Get commit info.

        :param repo:
        :param commit:
        :return:
        """

    @abc.abstractmethod
    def get_branch_head(self, repo, branch):
        """Get HEAD of a branch.

        :param repo:
        :param branch:
        :return:
        """

    def has_webhook(self, repo):
        """Test if webhook is configured on this repo.

        :param repo:
        :return:
        """
        return bool(self.get_webhook(repo))

    @abc.abstractmethod
    def get_webhook(self, repo):
        """Get webhook.

        :param repo:
        :return:
        """

    @abc.abstractmethod
    def get_webhooks(self, repo):
        """Get all webhooks.

        :param repo:
        :return:
        """

    @abc.abstractmethod
    def create_webhook(self, repo):
        """Create webhook.

        :param repo:
        :return:
        """

    @abc.abstractmethod
    def delete_webhook(self, repo):
        """Delete webhook.

        :param repo:
        :return:
        """

    @abc.abstractmethod
    def upload_job_result(self, payload):
        """Upload job result to bitbucket.

        :param payload:
        :return:
        """

    @abc.abstractmethod
    def get_yamls(self, repo, commit):
        """Get all YAML files in .argo folder.

        :param repo:
        :param commit:
        :return:
        """

    def get_repo_info(self, repo):
        """Get owner, name, and credential of repo.

        :param repo:
        :return:
        """
        if repo not in self.repos:
            tool_config = self.axops_client.get_tool(repo)
            if not tool_config:
                self.event_notification_client.send_message_to_notification_center(
                    notification_center.CODE_JOB_CI_REPO_NOT_FOUND,
                    detail={'repo': repo})
                raise UnknownRepository(
                    'Unable to find configuration for repo ({})'.format(repo))
            self.update_repo_info(repo, tool_config['type'],
                                  tool_config['username'],
                                  tool_config['password'])
        return (self.repos[repo]['owner'], self.repos[repo]['name'],
                self.repos[repo]['username'], self.repos[repo]['password'])

    def update_repo_info(self, repo, type, username, password):
        """Update repo info.

        :param repo:
        :param type:
        :param username:
        :param password:
        :return:
        """
        strs = repo.split('/')
        owner, name = strs[-2], strs[-1].split('.')[0]
        self.repos[repo] = {
            'type': type,
            'name': name,
            'owner': owner,
            'username': username,
            'password': password
        }

    def make_request(self, f, url, *args, **kwargs):
        """Make an HTTP request.

        :param f:
        :param url:
        :param args:
        :param kwargs:
        :returns:
        """
        logger.info('Request: %s %s', f.__name__, url)
        value_only = kwargs.pop('value_only', False)
        resp = f(url, *args, **kwargs)
        self.handle_exception(resp)
        if value_only:
            return resp.json()
        else:
            return resp

    def handle_exception(self, response):
        """Handle exception in response.

        :param response:
        :return:
        """
        try:
            response.raise_for_status()
        except requests.ConnectionError as e:
            # A ConnectionError usually carries no response object, so log the error itself
            logger.error('Connection error: %s', str(e))
            raise exceptions.AXTimeoutException('Connection timeout', str(e))
        except requests.HTTPError as e:
            logger.error('HTTP error: %s %s', e.response.status_code,
                         e.response.text)
            exception_class = self.EXCEPTION_MAPPING.get(
                response.status_code) or exceptions.AXApiInternalError
            raise exception_class('HTTP error', str(e))
        else:
            logger.info('Response: %s', response.status_code)
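
To illustrate EXCEPTION_MAPPING, a standalone sketch of the lookup that handle_exception performs; the status codes are sample values:

# 400 maps to AXApiInvalidParam and 404 to AXApiResourceNotFound, while an
# unmapped status such as 500 falls back to AXApiInternalError.
for status in (400, 404, 500):
    exc_cls = BaseScmRestClient.EXCEPTION_MAPPING.get(
        status) or exceptions.AXApiInternalError
    print(status, exc_cls.__name__)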
Example #10
0
from result.serializers import ResultSerializer

from ax.devops.axdb.axdb_client import AxdbClient
from ax.devops.redis.redis_client import RedisClient, DB_RESULT
from ax.devops.kafka.kafka_client import EventNotificationClient
from ax.exceptions import AXApiInvalidParam
from ax.notification_center import FACILITY_PLATFORM, CODE_PLATFORM_ERROR

logger = logging.getLogger('{}.{}'.format(LOGGER_NAME, 'result'))

axdb_client = AxdbClient()
redis_client = RedisClient(host='redis.axsys',
                           db=DB_RESULT,
                           retry_max_attempt=10,
                           retry_wait_fixed=5000)
event_notification_client = EventNotificationClient(FACILITY_PLATFORM)


class ResultViewSet(ListModelMixin, CreateModelMixin, RetrieveModelMixin,
                    DestroyModelMixin, GenericViewSet):
    """View set for result."""

    queryset = Result.objects.all()
    serializer_class = ResultSerializer

    @detail_route(methods=['GET'])
    def approval(self, request, *args, **kwargs):
        """Save an approval result in redis."""
        token = request.query_params.get('token', None)
Example #11
0
class EventTrigger(object):
    """DevOps event trigger."""

    axops_client = AxopsClient()
    event_notification_client = EventNotificationClient(FACILITY_AX_EVENT_TRIGGER)
    event_keys = {
        AxEventTypes.PUSH: [
            'repo',
            'branch',
            'commit',
            'author'
        ],
        AxEventTypes.CREATE: [
            'repo',
            'branch',
            'commit',
            'author'
        ],
        AxEventTypes.PULL_REQUEST: [
            'repo',
            'branch',
            'commit',
            'target_repo',
            'target_branch',
            'author'
        ],
        AxEventTypes.PULL_REQUEST_MERGE: [
            'repo',
            'branch',
            'commit',
            'source_repo',
            'source_branch',
            'author'
        ]
    }

    def evaluate(self, event):
        """Evaluate an event by enforcing its applicable event policies, and trigger a service instance.

        :param event:
        :return:
        """
        logger.info('Evaluating AX event ...')
        # TODO: Currently, git and codecommit do not support the update of YAML files
        if event['vendor'] not in {'git', 'codecommit'}:
            logger.info('Updating policies/templates (repo: %s, branch: %s) ...', event['repo'], event['branch'])
            payload = {
                'type': event['vendor'],
                'repo': event['repo'],
                'branch': event['branch']
            }
            # Not sure what it does
            if len(event.get('commit', '')) == 36:
                logger.info('It is a commit, try to update the YAML file')
                resp = requests.post('http://gateway:8889/v1/scm/yamls', json=payload)
                if 400 <= resp.status_code < 600:
                    raise YamlUpdateError('Failed to update YAML content', detail='Failed to update policy/template')

        services = []
        # If the event does not have command section or the command is rerun, we need to enforce policies
        if 'command' not in event or event.get('command') == AxCommands.RERUN:
            logger.info('Searching for applicable event policies ...')
            applicable_event_policies = self.get_applicable_event_policies(event)
            if applicable_event_policies:
                logger.info('Found %s applicable event policies', len(applicable_event_policies))
                for policy in applicable_event_policies:
                    # If we only need to rerun failed jobs, retrieve the status of the last job
                    if event.get('command') == AxCommands.RERUN and not event.get('rerun_all', False):
                        most_recent_service = self.axops_client.get_most_recent_service(
                            event['repo'], event['commit'], policy['id']
                        )
                        if most_recent_service['status'] >= 0:
                            logger.info('Most recent service succeeded or is still running; since the user chose '
                                        'to rerun only failed jobs, the creation of this service will be skipped')
                            continue
                    service = self.enforce_policy(policy, event)
                    if service:
                        services.append(service)
            else:
                logger.warning('Found 0 applicable event policies, skip processing')
        # If the command is run, we do not need to enforce policies
        else:
            logger.info('Received event with command, kicking off service now ...')
            service = self.run_command(event)
            if service:
                services.append(service)
        logger.info('Evaluation completed')
        return services

    def get_applicable_event_policies(self, event):
        """Get applicable event policies.

        :param event:
        :return:
        """
        applicable_policies = []
        if event['type'] == AxEventTypes.PULL_REQUEST:
            key_repo, key_branch = 'target_repo', 'target_branch'
        else:
            key_repo, key_branch = 'repo', 'branch'
        policies = self.axops_client.get_policies(repo=event[key_repo], branch=event[key_branch], enabled=True)
        logger.info("Found enabled policies with repo %s, branch %s, %s", event[key_repo], event[key_branch], policies)

        for policy in policies:
            if self.match(policy, event):
                applicable_policies.append(policy)

        return applicable_policies

    def match(self, policy, event):
        """Determine if a policy is applicable to an event.

        :param policy:
        :param event:
        :return:
        """
        conditions = policy['when']
        for condition in conditions:
            if self.match_condition(condition, event):
                return True
        return False

    def match_condition(self, condition, event):
        """Match event with a condition.

        :param condition:
        :param event:
        :return:
        """
        if event['type'] == AxEventTypes.CREATE and not condition['event'].endswith('tag'):
            return False
        elif event['type'] != AxEventTypes.CREATE and not condition['event'].endswith(event['type']):
            return False

        return True

    @staticmethod
    def match_patterns(patterns, string):
        """Match branch.

        :param patterns:
        :param string:
        :return:
        """
        for pattern in patterns:
            try:
                match = re.match(pattern, string)
            except Exception as e:
                logger.error('Failed to match pattern (pattern: %s, string: %s): %s', pattern, string, e)
                raise AXApiInternalError('Failed to match pattern', detail=str(e))
            else:
                if match:
                    return True
        return False

    def enforce_policy(self, policy, event):
        """Enforce policy.

        :param policy:
        :param event:
        :return:
        """
        # Retrieve service template payload
        logger.info('Retrieving service template ...')
        service_template = self.get_service_template_by_policy(policy)
        if not service_template:
            logger.warning('Unable to find service template, skip')
            return
        # Construct parameters from event and policy
        logger.info('Constructing parameters ...')
        parameters = self.construct_parameters(event=event, policy=policy)
        notifications = self.construct_notifications(policy)
        commit = self.construct_commit_info(event)
        # Create service instance
        service = {
            'commit': commit,
            'notifications': notifications,
            'arguments': parameters,
            'policy_id': policy['id'],
            'template': service_template
        }
        service = self.axops_client.create_service(service)
        logger.info('Successfully created service (id: %s)', service['id'])
        return service

    def run_command(self, event):
        """Run command specified in the event.

        :param event:
        :return:
        """
        # Retrieve service template payload
        logger.info('Retrieving service template ...')
        service_template = self.get_service_template_by_event(event)
        if not service_template:
            logger.warning('Unable to find service template, skip')
            self.event_notification_client.send_message_to_notification_center(CODE_JOB_CI_TEMPLATE_NOT_FOUND, detail=event)
            return
        # Construct parameters from event
        logger.info('Constructing parameters ...')
        parameters = self.construct_parameters(event=event)
        for key in event.get('arguments', {}):
            parameters[key] = event['arguments'][key]
        notifications = self.construct_notifications()
        commit = self.construct_commit_info(event)
        # Create service instance
        service = {
            'commit': commit,
            'notifications': notifications,
            'arguments': parameters,
            'template': service_template
        }
        service = self.axops_client.create_service(service)
        logger.info('Successfully created service (id: %s)', service['id'])
        return service

    def get_service_template_by_policy(self, policy):
        """Get service template by policy.

        :param policy:
        :return:
        """
        service_templates = self.axops_client.get_templates(policy['repo'], policy['branch'], policy['template'])
        if service_templates:
            return service_templates[0]

    def get_service_template_by_event(self, event):
        """Get service template by event.

        :param event:
        :return:
        """
        service_templates = self.axops_client.get_templates(event['repo'], event['branch'], event['template'])
        if service_templates:
            return service_templates[0]

    def construct_parameters(self, event, policy=None):
        """Construct parameters.

        :param event:
        :param policy:
        :return:
        """
        parameters_from_event = self.event_to_parameters(event)
        parameters = policy.get('arguments', {}) if policy else {}
        parameters.update(parameters_from_event)
        return parameters

    @staticmethod
    def construct_notifications(policy=None):
        """Construct notifications.

        :param policy:
        :return:
        """
        notifications = [
            {
                "whom": [
                    "scm"
                ],
                "when": [
                    "on_success",
                    "on_failure"
                ]
            }
        ]
        if policy and 'notifications' in policy:
            notifications += policy['notifications']
        return notifications

    @staticmethod
    def construct_commit_info(event):
        """Construct commit info.

        :param event:
        :return:
        """
        return {
            'revision': event['commit'],
            'repo': event['repo'],
            'branch': event['branch'],
            'author': event['author'],
            'committer': event['committer'],
            'description': event['description'],
            'date': int((datetime.datetime.strptime(event['date'], '%Y-%m-%dT%H:%M:%S') -
                         datetime.datetime(1970, 1, 1)).total_seconds())
        }

    def event_to_parameters(self, event):
        """Create parameters from event.

        :param event:
        :return:
        """
        parameters = {}
        for k in self.event_keys[event['type']]:
            key = 'session.{}'.format(k)
            parameters[key] = event[k]
        if event['type'] in [AxEventTypes.PUSH, AxEventTypes.CREATE]:
            parameters['session.target_branch'] = event['branch']
        return parameters
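
Finally, a hedged sketch of the push-event payload that evaluate expects; the field values are invented, the required keys follow event_keys and construct_commit_info above, and the gateway and AxOps services must be reachable for the call to succeed:

event = {
    'vendor': 'github',
    'type': AxEventTypes.PUSH,
    'repo': 'https://github.com/example/demo.git',  # hypothetical repo
    'branch': 'master',
    'commit': 'a' * 40,
    'author': 'dev@example.com',
    'committer': 'dev@example.com',
    'description': 'example push event',
    'date': '2017-01-01T00:00:00',
}
services = EventTrigger().evaluate(event)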