Example #1
    def test_add_aliases_duplicated(self):
        """Test whether an alias isn't added when already present in a given index"""

        elastic = ElasticSearch(self.es_con,
                                self.target_index,
                                GitOcean.mapping,
                                aliases=['A', 'B', 'C'])

        expected_aliases = {'A': {}, 'B': {}, 'C': {}}
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)

        elastic.add_alias('C')
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)
Example #2
    def test_add_aliases(self):
        """Test whether an alias is added to a given index"""

        elastic = ElasticSearch(self.es_con,
                                self.target_index,
                                GitOcean.mapping,
                                aliases=['A', 'B', 'C'])

        expected_aliases = {'A': {}, 'B': {}, 'C': {}}
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)

        expected_aliases = {'A': {}, 'B': {}, 'C': {}, 'D': {}}
        elastic.add_alias('D')
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)
Example #3
    def enrich_cocom_analysis(self,
                              ocean_backend,
                              enrich_backend,
                              no_incremental=False,
                              out_index="cocom_enrich_graal_repo",
                              interval_months=[3],
                              date_field="grimoire_creation_date"):

        logger.info("[cocom] study enrich-cocom-analysis start")

        es_in = ES([enrich_backend.elastic_url],
                   retry_on_timeout=True,
                   timeout=100,
                   verify_certs=self.elastic.requests.verify,
                   connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index
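        # interval values may arrive as strings (e.g. when read from setup.cfg),
        # so coerce them to integers before using them as month offsets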
        interval_months = list(map(int, interval_months))

        unique_repos = es_in.search(index=in_index,
                                    body=get_unique_repository())

        repositories = [
            repo['key']
            for repo in unique_repos['aggregations']['unique_repos'].get(
                'buckets', [])
        ]
        current_month = datetime_utcnow().replace(day=1,
                                                  hour=0,
                                                  minute=0,
                                                  second=0)

        logger.info("[cocom] study enrich-cocom-analysis {} repositories "
                    "to process".format(len(repositories)))
        es_out = ElasticSearch(enrich_backend.elastic.url,
                               out_index,
                               mappings=Mapping)
        es_out.add_alias("cocom_study")

        num_items = 0
        ins_items = 0

        for repository_url in repositories:
            logger.info("[cocom] study enrich-cocom-analysis start analysis "
                        "for {}".format(repository_url))
            evolution_items = []

            for interval in interval_months:

                to_month = get_to_date(es_in, in_index, out_index,
                                       repository_url, interval)
                to_month = to_month.replace(month=int(interval),
                                            day=1,
                                            hour=0,
                                            minute=0,
                                            second=0)

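                # walk forward from `to_month` in steps of `interval` months,
                # stopping before the current month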
                while to_month < current_month:
                    files_at_time = es_in.search(
                        index=in_index,
                        body=get_files_at_time(repository_url,
                                               to_month.isoformat())
                    )['aggregations']['file_stats'].get("buckets", [])

                    if not len(files_at_time):
                        to_month = to_month + relativedelta(months=+interval)
                        continue

                    repository_name = repository_url.split("/")[-1]
                    evolution_item = {
                        "id": "{}_{}_{}".format(to_month.isoformat(),
                                                repository_name, interval),
                        "origin": repository_url,
                        "interval_months": interval,
                        "study_creation_date": to_month.isoformat(),
                        "total_files": len(files_at_time)
                    }

                    for file_ in files_at_time:
                        file_details = file_["1"]["hits"]["hits"][0]["_source"]

                        for metric in self.metrics:
                            total_metric = "total_" + metric
                            metric_value = file_details[metric] \
                                if file_details[metric] is not None else 0
                            evolution_item[total_metric] = \
                                evolution_item.get(total_metric, 0) + metric_value

                    # TODO: Fix Logic: None rather than 1
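                    # max(..., 1) guards the ratios below against division by zero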
                    evolution_item["total_comments_per_loc"] = round(
                        evolution_item["total_comments"] /
                        max(evolution_item["total_loc"], 1), 2)
                    evolution_item["total_blanks_per_loc"] = round(
                        evolution_item["total_blanks"] /
                        max(evolution_item["total_loc"], 1), 2)
                    evolution_item["total_loc_per_function"] = round(
                        evolution_item["total_loc"] /
                        max(evolution_item["total_num_funs"], 1), 2)

                    evolution_item.update(
                        self.get_grimoire_fields(
                            evolution_item["study_creation_date"], "stats"))
                    evolution_items.append(evolution_item)

                    if len(evolution_items) >= self.elastic.max_items_bulk:
                        num_items += len(evolution_items)
                        ins_items += es_out.bulk_upload(
                            evolution_items, self.get_field_unique_id())
                        evolution_items = []

                    to_month = to_month + relativedelta(months=+interval)

                if len(evolution_items) > 0:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())

                if num_items != ins_items:
                    missing = num_items - ins_items
                    logger.error(
                        "[cocom] study enrich-cocom-analysis {}/{} missing items for Graal CoCom Analysis "
                        "Study".format(missing, num_items))
                else:
                    logger.info(
                        "[cocom] study enrich-cocom-analysis {} items inserted for Graal CoCom Analysis "
                        "Study".format(num_items))

            logger.info(
                "[cocom] study enrich-cocom-analysis End analysis for {} with month interval"
                .format(repository_url))

        logger.info("[cocom] study enrich-cocom-analysis End")
Example #4
    def enrich_backlog_analysis(self,
                                ocean_backend,
                                enrich_backend,
                                no_incremental=False,
                                out_index="github_enrich_backlog",
                                date_field="grimoire_creation_date",
                                interval_days=1,
                                reduced_labels=["bug"],
                                map_label=["others", "bugs"]):
        """
        The purpose of this study is to add an additional index that stores the
        chronological evolution of open issues and their average open time.

        For each repository and label, the study runs from the repository
        creation date until today with a one-day interval (default). For each
        date, the number of issues open at that date is computed as the
        difference between the number of opened issues and the number of closed
        issues. In addition, the average open time of all issues still open at
        that date is computed.

        To differentiate by label, the evolution is computed for bugs and for
        all other labels (such as "enhancement", "good first issue", ...); we
        call these "reduced labels". Reduced labels are needed because computing
        the evolution for every combination of labels would be too expensive.
        In addition, the "bug" label can be renamed to "bugs" with map_label.

        Example entry in setup.cfg:

        [github]
        raw_index = github_issues_raw
        enriched_index = github_issues_enriched
        ...
        studies = [enrich_backlog_analysis]

        [enrich_backlog_analysis]
        out_index = github_enrich_backlog
        interval_days = 7
        reduced_labels = [bug,enhancement]
        map_label = [others, bugs, enhancements]

        """

        logger.info("[github] Start enrich_backlog_analysis study")

        # combine two lists to create the dict to map labels
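        # e.g. with the defaults reduced_labels=["bug"] and
        # map_label=["others", "bugs"] this yields {"": "others", "bug": "bugs"}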
        map_label = dict(zip([""] + reduced_labels, map_label))

        # connect to ES
        es_in = ES([enrich_backend.elastic_url],
                   retry_on_timeout=True,
                   timeout=100,
                   verify_certs=self.elastic.requests.verify,
                   connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index

        # get all repositories
        unique_repos = es_in.search(
            index=in_index, body=get_unique_repository_with_project_name())
        repositories = [
            repo['key']
            for repo in unique_repos['aggregations']['unique_repos'].get(
                'buckets', [])
        ]

        logger.debug(
            "[enrich-backlog-analysis] {} repositories to process".format(
                len(repositories)))

        # create the index
        es_out = ElasticSearch(enrich_backend.elastic.url,
                               out_index,
                               mappings=Mapping)
        es_out.add_alias("backlog_study")

        # analysis for each repository
        num_items = 0
        ins_items = 0
        for repository in repositories:
            repository_url = repository["origin"]
            project = repository["project"]
            org_name = repository["organization"]
            repository_name = repository_url.split("/")[-1]

            logger.debug(
                "[enrich-backlog-analysis] Start analysis for {}".format(
                    repository_url))

            # get each day since repository creation
            dates = es_in.search(index=in_index,
                                 body=get_issues_dates(interval_days,
                                                       repository_url)
                                 )['aggregations']['created_per_interval'].get(
                                     "buckets", [])

            # for the group of all other labels first, then for each reduced label
            for label, other in [("", True)] + [(l, False)
                                                for l in reduced_labels]:
                # compute metrics for each day (ES request for each day)
                evolution_items = []
                for date in map(lambda i: i['key_as_string'], dates):
                    evolution_item = self.__create_backlog_item(
                        repository_url, repository_name, project, date,
                        org_name, interval_days, label, map_label,
                        self.__get_opened_issues(es_in, in_index,
                                                 repository_url, date,
                                                 interval_days, other, label,
                                                 reduced_labels))
                    evolution_items.append(evolution_item)

                # skip the extrapolation if there is no issue history for this
                # label (otherwise evolution_item below would be undefined)
                if not evolution_items:
                    continue

                # complete until today (no ES request needed, just extrapolate
                # from the last computed item)
                today = datetime.now().replace(hour=0,
                                               minute=0,
                                               second=0,
                                               tzinfo=None)
                last_item = evolution_item
                last_date = str_to_datetime(
                    evolution_item['study_creation_date']).replace(tzinfo=None) \
                    + relativedelta(days=interval_days)
                average_opened_time = evolution_item['average_opened_time'] \
                    + float(interval_days)
                while last_date < today:
                    date = last_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')
                    evolution_item = {}
                    evolution_item.update(last_item)
                    evolution_item.update({
                        "average_opened_time": average_opened_time,
                        "study_creation_date": date,
                        "uuid": "{}_{}_{}".format(date, repository_name, label),
                    })
                    evolution_item.update(
                        self.get_grimoire_fields(date, "stats"))
                    evolution_items.append(evolution_item)
                    last_date = last_date + relativedelta(days=interval_days)
                    average_opened_time = average_opened_time + float(
                        interval_days)

                # upload items to ES
                if len(evolution_items) > 0:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())

                if num_items != ins_items:
                    missing = num_items - ins_items
                    logger.error(
                        "[enrich-backlog-analysis] %s/%s missing items "
                        "for Graal Backlog Analysis Study", missing, num_items)
                else:
                    logger.debug(
                        "[enrich-backlog-analysis] %s items inserted "
                        "for Graal Backlog Analysis Study", num_items)

        logger.info("[github] End enrich_backlog_analysis study")
Example #5
    def enrich_colic_analysis(self,
                              ocean_backend,
                              enrich_backend,
                              no_incremental=False,
                              out_index="colic_enrich_graal_repo",
                              interval_months=[3],
                              date_field="grimoire_creation_date"):

        logger.info("[colic] study enrich-colic-analysis start")

        es_in = ES([enrich_backend.elastic_url],
                   retry_on_timeout=True,
                   timeout=100,
                   verify_certs=self.elastic.requests.verify,
                   connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index
        interval_months = list(map(int, interval_months))

        unique_repos = es_in.search(index=in_index,
                                    body=get_unique_repository())

        repositories = [
            repo['key']
            for repo in unique_repos['aggregations']['unique_repos'].get(
                'buckets', [])
        ]

        logger.info("[colic] study enrich-colic-analysis {} repositories "
                    "to process".format(len(repositories)))
        es_out = ElasticSearch(enrich_backend.elastic.url,
                               out_index,
                               mappings=Mapping)
        es_out.add_alias("colic_study")

        current_month = datetime_utcnow().replace(day=1,
                                                  hour=0,
                                                  minute=0,
                                                  second=0)
        num_items = 0
        ins_items = 0

        for repository_url in repositories:
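            # anonymize the repository URL (e.g. strip credentials/tokens)
            # before it is logged or stored in the study items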
            repository_url_anonymized = repository_url
            if repository_url_anonymized.startswith('http'):
                repository_url_anonymized = anonymize_url(
                    repository_url_anonymized)

            logger.info("[colic] study enrich-colic-analysis start analysis "
                        "for {}".format(repository_url_anonymized))
            evolution_items = []

            for interval in interval_months:

                to_month = get_to_date(es_in, in_index, out_index,
                                       repository_url, interval)
                to_month = to_month.replace(month=int(interval),
                                            day=1,
                                            hour=0,
                                            minute=0,
                                            second=0)

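                # walk forward from `to_month` in steps of `interval` months,
                # stopping before the current month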
                while to_month < current_month:
                    copyrighted_files_at_time = es_in.search(
                        index=in_index,
                        body=self.__get_copyrighted_files(
                            repository_url, to_month.isoformat()))

                    licensed_files_at_time = es_in.search(
                        index=in_index,
                        body=self.__get_licensed_files(repository_url,
                                                       to_month.isoformat()))

                    files_at_time = es_in.search(index=in_index,
                                                 body=self.__get_total_files(
                                                     repository_url,
                                                     to_month.isoformat()))

                    licensed_files = int(
                        licensed_files_at_time["aggregations"]["1"]["value"])
                    copyrighted_files = int(
                        copyrighted_files_at_time["aggregations"]["1"]["value"])
                    total_files = int(
                        files_at_time["aggregations"]["1"]["value"])

                    if not total_files:
                        to_month = to_month + relativedelta(months=+interval)
                        continue

                    evolution_item = {
                        "id": "{}_{}_{}".format(to_month.isoformat(),
                                                hash(repository_url_anonymized),
                                                interval),
                        "repo_url": repository_url_anonymized,
                        "origin": repository_url,
                        "interval_months": interval,
                        "study_creation_date": to_month.isoformat(),
                        "licensed_files": licensed_files,
                        "copyrighted_files": copyrighted_files,
                        "total_files": total_files
                    }

                    evolution_item.update(
                        self.get_grimoire_fields(
                            evolution_item["study_creation_date"], "stats"))
                    evolution_items.append(evolution_item)

                    if len(evolution_items) >= self.elastic.max_items_bulk:
                        num_items += len(evolution_items)
                        ins_items += es_out.bulk_upload(
                            evolution_items, self.get_field_unique_id())
                        evolution_items = []

                    to_month = to_month + relativedelta(months=+interval)

                if len(evolution_items) > 0:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())

                if num_items != ins_items:
                    missing = num_items - ins_items
                    logger.error(
                        "[colic] study enrich-colic-analysis {}/{} missing items for Graal CoLic Analysis "
                        "Study".format(missing, num_items))
                else:
                    logger.info(
                        "[colic] study enrich-colic-analysis {} items inserted for Graal CoLic Analysis "
                        "Study".format(num_items))

            logger.info(
                "[colic] study enrich-colic-analysis end analysis for {} with month interval"
                .format(repository_url_anonymized))

        logger.info("[colic] study enrich-colic-analysis end")