    def execute(self):
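        """Collect raw data for this backend section.

        Return a list of per-repository error records, each a dict with
        'backend', 'repo' and 'error' keys.
        """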

        errors = []
        cfg = self.config.get_conf()

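        # Optional Elasticsearch tuning taken from the [general] section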
        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logger.info('%s collect disabled', self.backend_section)
            return errors

        t2 = time.time()
        logger.info('[%s] collection phase starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False  # keep the existing raw index (do not recreate it)

        fetch_archive = False
        if ('fetch-archive' in cfg[self.backend_section]
                and cfg[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            repo, repo_labels = self._extract_repo_labels(
                self.backend_section, repo)
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')

            if filter_raw:
                # If filter-raw exists it means that there is an equivalent URL
                # in the `unknown` section of the projects.json. Thus the URL with
                # filter-raw is ignored in the collection phase, while the URL
                # in `unknown` is considered in this phase.
                logger.warning("Not collecting filter-raw repository: %s",
                               repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.info('[%s] collection starts for %s', self.backend_section,
                        repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron

            es_aliases = self.select_aliases(cfg, self.backend_section)

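            # feed_backend fetches the raw items and writes them to the raw
            # index; whatever error message it returns is recorded for this
            # repository.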
            try:
                error_msg = feed_backend(es_col_url,
                                         clean,
                                         fetch_archive,
                                         backend,
                                         backend_args,
                                         cfg[ds]['raw_index'],
                                         cfg[ds]['enriched_index'],
                                         project,
                                         es_aliases=es_aliases,
                                         projects_json_repo=repo,
                                         repo_labels=repo_labels)
                error = {'backend': backend, 'repo': repo, 'error': error_msg}
                errors.append(error)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s",
                    ds, url, backend_args)
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)
            logger.info('[%s] collection finished for %s',
                        self.backend_section, repo)

        t3 = time.time()
        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] collection phase finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished after {} hours".format(
            self.backend_section, spent_time))

        self.retain_data(cfg['general']['retention_time'],
                         cfg['es_collection']['url'],
                         cfg[self.backend_section]['raw_index'])

        return errors
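
    # Usage sketch (hypothetical driver code; the class and constructor names
    # below are assumptions for illustration, not part of this file):
    #
    #   task = TaskRawDataCollection(config, backend_section='git')
    #   errors = task.execute()
    #   for record in errors:
    #       if record['error']:
    #           print(record['repo'], record['error'])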