예제 #1
0
        def load_identities_file(filename, reset=False):
            """
            Load a SortingHat identities file, optionally resetting first.

            :param filename: path of the identities file handed to `Load`
            :param reset: when true, `--reset` is passed ahead of
                `--identities` so that merges and affiliations of the
                loaded identities are cleaned and the grouping from the
                file is honored

            A non-success return code is only logged as an error.
            """
            load_args = ["--identities", filename]
            start_msg = "[sortinghat] Loading identities from file %s"
            if reset:
                load_args.insert(0, "--reset")
                start_msg = "[sortinghat] Loading identities with reset from file %s"

            logger.info(start_msg, filename)
            status = Load(**self.sh_kwargs).run(*load_args)

            if status != CMD_SUCCESS:
                logger.error("[sortinghat] Error loading %s", filename)
            logger.info("[sortinghat] End of loading identities from file %s", filename)
예제 #2
0
        def load_identities_file(filename, reset=False):
            """
            Load an identities file in Sortinghat with reset option

            The reset option cleans all merges and affiliations in identities that will be
            loaded to honor the identities grouping from the file.

            Without reset, the file is only loaded when its content hash is
            not yet a key of `self.current_identities_files_hash`. That
            mapping goes from content hash to a dict with the `filename` and
            a `has_changed` flag (1 when the file was loaded in this call,
            0 when it was skipped because the hash was already known).

            :param filename: path of the identities file to load
            :param reset: when true, always load and pass `--reset`
            """
            if reset:
                logger.info(
                    "[sortinghat] Loading identities with reset from file %s",
                    filename)
                code = Load(**self.sh_kwargs).run("--reset", "--identities",
                                                  filename)
            else:
                content_hash = get_file_hash(filename)
                if content_hash in self.current_identities_files_hash:
                    self.current_identities_files_hash.update({
                        content_hash: {
                            'filename': filename,
                            'has_changed': 0
                        }
                    })
                    logger.info(
                        "[sortinghat] No changes in file %s, identities won't be loaded",
                        filename)
                    code = CMD_SUCCESS
                else:
                    logger.info("[sortinghat] Loading identities from file %s",
                                filename)
                    code = Load(**self.sh_kwargs).run("--identities", filename)
                    # Remember the hash only after a successful load.
                    # Recording it on failure would make the next call skip
                    # the file as "unchanged" although it was never loaded.
                    if code == CMD_SUCCESS:
                        self.current_identities_files_hash.update({
                            content_hash: {
                                'filename': filename,
                                'has_changed': 1
                            }
                        })

            if code != CMD_SUCCESS:
                logger.error("[sortinghat] Error loading %s", filename)
            logger.info("[sortinghat] End of loading identities from file %s",
                        filename)
예제 #3
0
    def run(self):
        """
        Initialize the SortingHat database and load orgs and identities.

        Steps, in order:

        1. Run the `Init` command for `self.db_sh`.
        2. When `self.load_orgs` is set, load `self.conf['sh_orgs_file']`
           with `Load --orgs`.
        3. When `sh_ids_file` is configured, load every file of that
           comma separated list with `Load --identities`.

        Failed loads are logged as errors; nothing is returned.
        """
        # The return code of the init command is not checked here
        Init(**self.sh_kwargs).run(self.db_sh)

        if self.load_orgs:
            orgs_file = self.conf['sh_orgs_file']
            logger.info("[sortinghat] Loading orgs from file %s", orgs_file)
            code = Load(**self.sh_kwargs).run("--orgs", orgs_file)
            if code != CMD_SUCCESS:
                logger.error("[sortinghat] Error loading %s", orgs_file)
            # FIXME get the number of loaded orgs

        if 'sh_ids_file' in self.conf:
            for raw_name in self.conf['sh_ids_file'].split(','):
                # Spaces are allowed around the commas of the config list;
                # strip them before logging so the log shows the real name.
                f = raw_name.replace(' ', '')
                if not f:
                    # Empty entry (e.g. trailing comma): nothing to load
                    continue
                logger.info("[sortinghat] Loading identities from file %s", f)
                code = Load(**self.sh_kwargs).run("--identities", f)
                if code != CMD_SUCCESS:
                    logger.error("[sortinghat] Error loading %s", f)
예제 #4
0
    def setUp(self):
        """Prepare the registry connection, the dataset and the command.

        The test is failed right away when `sys.stdout` has no `getvalue`
        attribute, which is how buffered mode is detected here.
        """
        stdout_is_buffered = hasattr(sys.stdout, 'getvalue')
        if not stdout_is_buffered:
            self.fail('This test needs to be run in buffered mode')

        # Connection used to check the contents of the registry
        self.db = Database(DB_USER, DB_PASSWORD, DB_NAME, DB_HOST, DB_PORT)

        self._load_test_dataset()

        # Command under test, built from the same connection settings
        self.kwargs = dict(
            user=DB_USER,
            password=DB_PASSWORD,
            database=DB_NAME,
            host=DB_HOST,
            port=DB_PORT,
        )
        self.cmd = Load(**self.kwargs)
예제 #5
0
    def execute(self):
        def is_remote(filename):
            """ Tell whether `filename` should be treated as remote.

            Naive check, to be evolved: anything containing 'http' counts
            as remote.
            """
            return 'http' in filename

        def load_identities_file(filename, reset=False):
            """
            Load a SortingHat identities file, optionally resetting first.

            :param filename: path of the identities file handed to `Load`
            :param reset: when true, `--reset` is passed ahead of
                `--identities` so that merges and affiliations of the
                loaded identities are cleaned and the grouping from the
                file is honored

            A non-success return code is only logged as an error.
            """
            load_args = ["--identities", filename]
            start_msg = "[sortinghat] Loading identities from file %s"
            if reset:
                load_args.insert(0, "--reset")
                start_msg = \
                    "[sortinghat] Loading identities with reset from file %s"

            logger.info(start_msg, filename)
            status = Load(**self.sh_kwargs).run(*load_args)

            if status != CMD_SUCCESS:
                logger.error("[sortinghat] Error loading %s", filename)
            logger.info("[sortinghat] End of loading identities from file %s",
                        filename)

        def load_sortinghat_identities(config):
            """ Load identities from files in SortingHat JSON format

            Each entry of the `identities_file` list in the `sortinghat`
            section is processed in order. Entries that `is_remote` reports
            as remote are fetched through the GitHub Data API into a
            temporary file; any other entry is loaded straight from its
            path. Every load uses the configured `reset_on_load` value.

            A remote entry is skipped with an error when no API token is
            configured. When the SHA of a remote file can not be found the
            function returns, so the remaining entries are not processed.

            :param config: object whose `get_conf()` returns the configuration
            """
            cfg = config.get_conf()
            sh_cfg = cfg['sortinghat']

            filenames = sh_cfg['identities_file']
            # The token is only needed for remote files, so a configuration
            # without it must not break the load of local files.
            api_token = None
            if 'identities_api_token' in sh_cfg:
                api_token = sh_cfg['identities_api_token']

            for filename in filenames:
                filename = filename.replace(
                    ' ', '')  # spaces used in config file list
                if filename == '':
                    continue

                if not is_remote(filename):
                    load_identities_file(filename, sh_cfg['reset_on_load'])
                    continue

                if api_token is None:
                    logger.error(
                        "API Token not provided. Identities won't be loaded from %s",
                        filename)
                    continue

                # Use the GitHub Data API to get the file
                # First we need the SHA for this file
                try:
                    # https://github.com/<owner>/<repo>/blob/<branch>/<sh_identities>
                    # Indexing (not unpacking) keeps malformed URLs raising
                    # the IndexError handled below.
                    file_parts = filename.rsplit("/", 1)
                    repo_file = file_parts[1]
                    repo_parts = file_parts[0].rsplit("/", 2)
                    repository = repo_parts[0]
                    repository_branch = repo_parts[2]
                    repository_api = repository.replace(
                        'github.com', 'api.github.com/repos')
                    repo_file_sha = \
                        TaskIdentitiesExport.sha_github_file(config, repo_file,
                                                             repository_api, repository_branch)
                    if not repo_file_sha:
                        logger.error(
                            "Can't find identities file %s. Not loading identities",
                            filename)
                        return
                    file_url = repository_api + "/git/blobs/" + repo_file_sha
                    headers = {"Authorization": "token " + api_token}
                    res = requests.get(file_url, headers=headers)
                    res.raise_for_status()
                    with tempfile.NamedTemporaryFile() as temp:
                        temp.write(base64.b64decode(res.json()['content']))
                        # Flush so the loader sees the full content on disk
                        temp.flush()
                        load_identities_file(temp.name,
                                             sh_cfg['reset_on_load'])
                except IndexError as ex:
                    logger.error("Can not load identities from: %s",
                                 filename)
                    logger.debug(
                        "Expected format: https://github.com/owner/repo/blob/master/file"
                    )
                    logger.debug(ex)

        def load_grimoirelab_identities(config):
            """ Load identities from files in GrimoireLab YAML format

            Only the first entry of the `identities_file` list is used. When
            `is_remote` reports it as remote it is downloaded with the
            configured `identities_api_token` (sent as `PRIVATE-TOKEN`
            header) into a temporary file. The YAML file is then converted
            with the `grimoirelab2sh` command to a SortingHat JSON file,
            which is loaded with the configured `reset_on_load` value.

            Temporary files are removed whether or not the conversion and
            the load succeed.

            :param config: object whose `get_conf()` returns the configuration
            """

            logger.info("Loading GrimoireLab identities in SortingHat")

            cfg = config.get_conf()

            # Get the identities
            identities_url = cfg['sortinghat']['identities_file'][0]

            identities = None  # temp file with the downloaded copy, if any
            json_identities = None  # path of the converted JSON file
            try:
                if not is_remote(identities_url):
                    identities_filename = identities_url
                else:
                    # The file should be in gitlab in other case
                    if 'identities_api_token' not in cfg['sortinghat']:
                        logger.error(
                            "API Token not provided. Identities won't be loaded")
                        return
                    token = cfg['sortinghat']['identities_api_token']
                    res = requests.get(identities_url,
                                       headers={"PRIVATE-TOKEN": token})
                    res.raise_for_status()
                    identities = tempfile.NamedTemporaryFile()
                    identities.write(res.content)
                    # The converter reads the file by name from another
                    # process, so buffered data must reach the disk first.
                    identities.flush()
                    identities_filename = identities.name

                # Convert to a JSON file in SH format
                # grimoirelab2sh -i identities.yaml -s ssf:manual -o ssf.json
                # NOTE(review): tempfile.mktemp() is deprecated because the
                # returned name can be claimed by another process before it
                # is used; kept since the converter creates the file itself.
                json_identities = tempfile.mktemp()
                cmd = [
                    'grimoirelab2sh', '-i', identities_filename, '-s',
                    cfg['general']['short_name'] + ':manual', '-o',
                    json_identities
                ]
                if self.__execute_command(cmd) != 0:
                    logger.error('Can not generate the SH JSON file from '
                                 'GrimoireLab yaml file. Do the files exists? '
                                 'Is the API token right?')
                else:
                    # Load the JSON file in SH format
                    load_identities_file(json_identities,
                                         cfg['sortinghat']['reset_on_load'])
            finally:
                # Closing the temp file removes it (remote case only)
                if identities is not None:
                    identities.close()
                if json_identities is not None and \
                        os.path.exists(json_identities):
                    os.remove(json_identities)

        # ** START SYNC LOGIC **
        # Check that enrichment tasks are not active before loading identities
        while True:
            time.sleep(
                1)  # check each second if the identities load could start
            with TasksManager.IDENTITIES_TASKS_ON_LOCK:
                with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                    enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                    logger.debug("Enrich tasks active: %i", enrich_tasks)
                    if enrich_tasks == 0:
                        # The load of identities can be started
                        TasksManager.IDENTITIES_TASKS_ON = True
                        break
        #  ** END SYNC LOGIC **

        # From here on IDENTITIES_TASKS_ON is True. The `finally` clause
        # resets it even when one of the steps below raises, otherwise the
        # flag would stay active forever.
        try:
            cfg = self.config.get_conf()
            sh_cfg = cfg['sortinghat']

            # The return code of the init command is not checked here
            Init(**self.sh_kwargs).run(self.db_sh)

            # Basic loading of organizations from a SH JSON file. Legacy stuff.
            if 'load_orgs' in sh_cfg and sh_cfg['load_orgs']:
                if 'orgs_file' not in sh_cfg or not sh_cfg['orgs_file']:
                    logger.error("Load orgs active but no orgs_file configured")
                else:
                    orgs_file = sh_cfg['orgs_file']
                    logger.info("[sortinghat] Loading orgs from file %s",
                                orgs_file)
                    code = Load(**self.sh_kwargs).run("--orgs", orgs_file)
                    if code != CMD_SUCCESS:
                        logger.error("[sortinghat] Error loading %s",
                                     orgs_file)
                    # FIXME get the number of loaded orgs

            # Identities loading from files. It could be in several formats.
            # Right now GrimoireLab and SortingHat formats are supported
            if 'identities_file' in sh_cfg:
                identities_format = sh_cfg['identities_format']
                if identities_format == 'sortinghat':
                    load_sortinghat_identities(self.config)
                elif identities_format == 'grimoirelab':
                    load_grimoirelab_identities(self.config)
        finally:
            with TasksManager.IDENTITIES_TASKS_ON_LOCK:
                TasksManager.IDENTITIES_TASKS_ON = False
예제 #6
0
    def execute(self):
        def is_remote(filename):
            """ Tell whether `filename` should be treated as remote.

            Naive check, to be evolved: anything containing 'http' counts
            as remote.
            """
            return 'http' in filename

        def load_identities_file(filename, reset=False):
            """
            Load an identities file in Sortinghat with reset option

            The reset option cleans all merges and affiliations in identities that will be
            loaded to honor the identities grouping from the file.

            Without reset, the file is only loaded when its content hash is
            not yet a key of `self.current_identities_files_hash`. That
            mapping goes from content hash to a dict with the `filename` and
            a `has_changed` flag (1 when the file was loaded in this call,
            0 when it was skipped because the hash was already known).

            :param filename: path of the identities file to load
            :param reset: when true, always load and pass `--reset`
            """
            if reset:
                logger.info(
                    "[sortinghat] Loading identities with reset from file %s",
                    filename)
                code = Load(**self.sh_kwargs).run("--reset", "--identities",
                                                  filename)
            else:
                content_hash = get_file_hash(filename)
                if content_hash in self.current_identities_files_hash:
                    self.current_identities_files_hash.update({
                        content_hash: {
                            'filename': filename,
                            'has_changed': 0
                        }
                    })
                    logger.info(
                        "[sortinghat] No changes in file %s, identities won't be loaded",
                        filename)
                    code = CMD_SUCCESS
                else:
                    logger.info("[sortinghat] Loading identities from file %s",
                                filename)
                    code = Load(**self.sh_kwargs).run("--identities", filename)
                    # Remember the hash only after a successful load.
                    # Recording it on failure would make the next call skip
                    # the file as "unchanged" although it was never loaded.
                    if code == CMD_SUCCESS:
                        self.current_identities_files_hash.update({
                            content_hash: {
                                'filename': filename,
                                'has_changed': 1
                            }
                        })

            if code != CMD_SUCCESS:
                logger.error("[sortinghat] Error loading %s", filename)
            logger.info("[sortinghat] End of loading identities from file %s",
                        filename)

        def load_sortinghat_identities(config):
            """ Load identities from files in SortingHat JSON format

            Each entry of the `identities_file` list in the `sortinghat`
            section is processed in order. Entries that `is_remote` reports
            as remote are fetched through the GitHub Data API into a
            temporary file; any other entry is loaded straight from its
            path. Every load uses the configured `reset_on_load` value.

            A remote entry is skipped with an error when no API token is
            configured. When the SHA of a remote file can not be found the
            function returns, so the remaining entries are not processed.

            :param config: object whose `get_conf()` returns the configuration
            """
            cfg = config.get_conf()
            sh_cfg = cfg['sortinghat']

            filenames = sh_cfg['identities_file']
            # The token is only needed for remote files, so a configuration
            # without it must not break the load of local files.
            api_token = None
            if 'identities_api_token' in sh_cfg:
                api_token = sh_cfg['identities_api_token']

            for filename in filenames:
                filename = filename.replace(
                    ' ', '')  # spaces used in config file list
                if filename == '':
                    continue

                if not is_remote(filename):
                    load_identities_file(filename, sh_cfg['reset_on_load'])
                    continue

                if api_token is None:
                    logger.error(
                        "API Token not provided. Identities won't be loaded from %s",
                        filename)
                    continue

                # Use the GitHub Data API to get the file
                # First we need the SHA for this file
                try:
                    # https://github.com/<owner>/<repo>/blob/<branch>/<sh_identities>
                    # Indexing (not unpacking) keeps malformed URLs raising
                    # the IndexError handled below.
                    file_parts = filename.rsplit("/", 1)
                    repo_file = file_parts[1]
                    repo_parts = file_parts[0].rsplit("/", 2)
                    repository = repo_parts[0]
                    repository_branch = repo_parts[2]
                    repository_api = repository.replace(
                        'github.com', 'api.github.com/repos')
                    repo_file_sha = \
                        TaskIdentitiesExport.sha_github_file(config, repo_file,
                                                             repository_api, repository_branch)
                    if not repo_file_sha:
                        logger.error(
                            "Can't find identities file %s. Not loading identities",
                            filename)
                        return
                    file_url = repository_api + "/git/blobs/" + repo_file_sha
                    headers = {"Authorization": "token " + api_token}
                    res = requests.get(file_url, headers=headers)
                    res.raise_for_status()
                    with tempfile.NamedTemporaryFile() as temp:
                        temp.write(base64.b64decode(res.json()['content']))
                        # Flush so the loader sees the full content on disk
                        temp.flush()
                        load_identities_file(temp.name,
                                             sh_cfg['reset_on_load'])
                except IndexError as ex:
                    logger.error("Can not load identities from: %s",
                                 filename)
                    logger.debug(
                        "Expected format: https://github.com/owner/repo/blob/master/file"
                    )
                    logger.debug(ex)

        def load_grimoirelab_identities(config):
            """ Load identities from files in GrimoireLab YAML format

            Only the first entry of the `identities_file` list is used. When
            `is_remote` reports it as remote it is downloaded with the
            configured `identities_api_token` (sent as `PRIVATE-TOKEN`
            header) into a temporary file. The YAML file is then converted
            with the `grimoirelab2sh` command to a SortingHat JSON file,
            which is loaded with the configured `reset_on_load` value.
            `--no-email-validation` is added to the conversion when
            `strict_mapping` is disabled.

            Temporary files are removed whether or not the conversion and
            the load succeed.

            :param config: object whose `get_conf()` returns the configuration
            """

            logger.info("Loading GrimoireLab identities in SortingHat")

            cfg = config.get_conf()

            # Get the identities
            identities_url = cfg['sortinghat']['identities_file'][0]

            identities = None  # temp file with the downloaded copy, if any
            json_identities = None  # path of the converted JSON file
            try:
                if not is_remote(identities_url):
                    identities_filename = identities_url
                else:
                    # The file should be in gitlab in other case
                    if 'identities_api_token' not in cfg['sortinghat']:
                        logger.error(
                            "API Token not provided. Identities won't be loaded")
                        return
                    token = cfg['sortinghat']['identities_api_token']
                    res = requests.get(identities_url,
                                       headers={"PRIVATE-TOKEN": token})
                    res.raise_for_status()
                    identities = tempfile.NamedTemporaryFile()
                    identities.write(res.content)
                    # The converter reads the file by name from another
                    # process, so buffered data must reach the disk first.
                    identities.flush()
                    identities_filename = identities.name

                # Convert to a JSON file in SH format
                # grimoirelab2sh -i identities.yaml -s ssf:manual -o ssf.json
                # NOTE(review): tempfile.mktemp() is deprecated because the
                # returned name can be claimed by another process before it
                # is used; kept since the converter creates the file itself.
                json_identities = tempfile.mktemp()
                cmd = [
                    'grimoirelab2sh', '-i', identities_filename, '-s',
                    cfg['general']['short_name'] + ':manual', '-o',
                    json_identities
                ]
                if not cfg['sortinghat']['strict_mapping']:
                    cmd += ['--no-email-validation']
                if self.__execute_command(cmd) != 0:
                    logger.error('Can not generate the SH JSON file from '
                                 'GrimoireLab yaml file. Do the files exists? '
                                 'Is the API token right?')
                else:
                    # Load the JSON file in SH format
                    load_identities_file(json_identities,
                                         cfg['sortinghat']['reset_on_load'])
            finally:
                # Closing the temp file removes it (remote case only)
                if identities is not None:
                    identities.close()
                if json_identities is not None and \
                        os.path.exists(json_identities):
                    os.remove(json_identities)

        # ** START SYNC LOGIC **
        # Check that enrichment tasks are not active before loading identities
        while True:
            time.sleep(
                1)  # check each second if the identities load could start
            with TasksManager.IDENTITIES_TASKS_ON_LOCK:
                with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                    enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                    logger.debug("[load identities] Enrich tasks active: %i",
                                 enrich_tasks)
                    if enrich_tasks == 0:
                        # The load of identities can be started
                        TasksManager.IDENTITIES_TASKS_ON = True
                        break
        #  ** END SYNC LOGIC **

        # From here on IDENTITIES_TASKS_ON is True. The `finally` clause
        # resets it whatever step fails (init, orgs load, identities load
        # or unify), not only the identities load.
        try:
            cfg = self.config.get_conf()
            sh_cfg = cfg['sortinghat']

            # The return code of the init command is not checked here
            Init(**self.sh_kwargs).run(self.db_sh, '--reuse')

            # Basic loading of organizations from a SH JSON file. Legacy stuff.
            if 'load_orgs' in sh_cfg and sh_cfg['load_orgs']:
                if 'orgs_file' not in sh_cfg or not sh_cfg['orgs_file']:
                    logger.error("Load orgs active but no orgs_file configured")
                elif not os.path.exists(sh_cfg['orgs_file']):
                    logger.error("Orgs file not found on disk")
                else:
                    orgs_file = sh_cfg['orgs_file']
                    orgs_file_hash = get_file_hash(orgs_file)
                    if not self.current_orgs_file_hash or \
                            self.current_orgs_file_hash != orgs_file_hash:
                        logger.info("[sortinghat] Loading orgs from file %s",
                                    orgs_file)
                        code = Load(**self.sh_kwargs).run("--orgs", orgs_file)
                        if code != CMD_SUCCESS:
                            # Hash is not stored, so the load is retried
                            # the next time this task runs.
                            logger.error("[sortinghat] Error loading %s",
                                         orgs_file)
                        else:
                            self.current_orgs_file_hash = orgs_file_hash
                            with open(orgs_file, 'r') as f:
                                json_content = json.load(f)
                            logger.info("[sortinghat] %s organizations loaded",
                                        len(json_content['organizations']))
                    else:
                        logger.info(
                            "[sortinghat] No changes in file %s, organizations won't be loaded",
                            orgs_file)

            # Identities loading from files. It could be in several formats.
            # Right now GrimoireLab and SortingHat formats are supported
            if 'identities_file' in sh_cfg:
                identities_format = sh_cfg['identities_format']
                if identities_format == 'sortinghat':
                    load_sortinghat_identities(self.config)
                elif identities_format == 'grimoirelab':
                    load_grimoirelab_identities(self.config)

                # If one of the identities file has changed, after loading the
                # identities we need to unify in order to mix the identities
                # loaded with the ones from data sources.
                unify = any(
                    v['has_changed']
                    for v in self.current_identities_files_hash.values())
                if unify:
                    cmd = [
                        'sortinghat', '-u', self.db_user, '-p',
                        self.db_password, '--host', self.db_host, '-d',
                        self.db_sh
                    ]
                    cmd += ['unify', '--fast-matching']
                    # One unify run per configured matching algorithm
                    for algo in sh_cfg['matching']:
                        ucmd = cmd + ['-m', algo]
                        if not sh_cfg['strict_mapping']:
                            ucmd += ['--no-strict-matching']
                        logger.debug("Doing unify after identities load")
                        self.__execute_command(ucmd)
        finally:
            with TasksManager.IDENTITIES_TASKS_ON_LOCK:
                TasksManager.IDENTITIES_TASKS_ON = False