Example No. 1
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    with Explanation("Scanning {{branch}} {{revision|left(12)}}", branch=branch.name, revision=r, debug=DAEMON_DEBUG):
                        rev = self.get_revision(Revision(branch=branch, changeset={"id": r}))
                        if DAEMON_DEBUG:
                            Log.note("found revision with push date {{date|datetime}}", date=rev.push.date)
                        revisions.discard(r)

                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
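
A note on the worker-loop idiom above: `please_stop` is a stop signal that both the `while` test and `self.todo.pop(till=please_stop)` honour, so the daemon exits promptly once asked to stop instead of blocking on an empty queue. A rough standalone sketch of the same pattern using only the standard library (names here are illustrative, not the mo-threads API):

import queue
import threading

def daemon(todo: queue.Queue, please_stop: threading.Event):
    """Pop (branch, revisions) work items until asked to stop."""
    while not please_stop.is_set():
        try:
            # Wake up periodically so a stop request is noticed even when idle.
            branch, revisions = todo.get(timeout=1.0)
        except queue.Empty:
            continue
        if please_stop.is_set():
            break
        for r in set(revisions):
            print("scanning", branch, r[:12])
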
Example No. 2
    def __init__(self, kwargs=None):
        self.settings = kwargs
        excludes = listwrap(self.settings.exclude)
        self.settings.exclude = set(e for e in excludes
                                    if len(split_field(e)) == 1)
        self.settings.exclude_columns = set(p for e in excludes
                                            for p in [tuple(split_field(e))]
                                            if len(p) > 1)
        self.settings.exclude_path = list(
            map(split_field, listwrap(self.settings.exclude_path)))
        self.settings.show_foreign_keys = coalesce(
            self.settings.show_foreign_keys, True)
        self.name_relations = unwrap(coalesce(self.settings.name_relations,
                                              {}))

        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            self.settings.database.schema = self.db.settings.schema
            with self.db.transaction():
                self._scan_database()
        if not self.settings.database.schema:
            Log.error("you must provide a `database.schema`")
Example No. 3
def _get_url(url, branch, **kwargs):
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        response = http.get(url, **kwargs)
        data = json2value(response.content.decode("utf8"))
        if isinstance(data, (text_type, str)) and data.startswith("unknown revision"):
            Log.error("Unknown push {{revision}}", revision=strings.between(data, "'", "'"))
        branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
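
`strings.between(data, "'", "'")` extracts the revision id that hg quotes inside its "unknown revision '...'" message. A minimal standalone equivalent (an approximation, not the mo-logs `strings` implementation):

def between(value, prefix, suffix):
    # Text found after the first `prefix` and before the next `suffix`, else None.
    start = value.find(prefix)
    if start == -1:
        return None
    start += len(prefix)
    end = value.find(suffix, start)
    if end == -1:
        return None
    return value[start:end]

assert between("unknown revision 'deadbeef1234'", "'", "'") == "deadbeef1234"
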
Example No. 4
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions, after = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    try:
                        rev = self.get_revision(
                            Revision(branch=branch, changeset={"id": r}),
                            None,  # local
                            False,  # get_diff
                            True,  # get_moves
                        )
                        if after and after > rev.etl.timestamp:
                            rev = self._get_from_hg(revision=rev)

                        if DAEMON_DEBUG:
                            Log.note(
                                "found revision with push date {{date|datetime}}",
                                date=rev.push.date,
                            )
                        revisions.discard(r)

                        if rev.etl.timestamp > Date.now() - (
                                DAEMON_RECENT_HG_PULL * SECOND):
                            # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                            # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                            # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                            # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                            Till(seconds=Random.float(DAEMON_HG_INTERVAL *
                                                      2)).wait()

                    except Exception as e:
                        Log.warning(
                            "Scanning {{branch}} {{revision|left(12)}}",
                            branch=branch.name,
                            revision=r,
                            cause=e,
                        )
                        if "Read timed out" in e:
                            Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()

                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
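
The `Till(seconds=Random.float(DAEMON_HG_INTERVAL * 2)).wait()` call above sleeps for a random interval so that several machines that just indexed the same large push do not keep hitting hg.mozilla.org in lockstep (the bugzilla link in the comment explains the motivation). The same jittered delay with the standard library (the constant is a placeholder, not the module's real value):

import random
import time

DAEMON_HG_INTERVAL = 30  # seconds; placeholder value for illustration

def jittered_pause():
    # Sleep a uniformly random time in [0, 2 * DAEMON_HG_INTERVAL) so peers desynchronize.
    time.sleep(random.uniform(0, DAEMON_HG_INTERVAL * 2))
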
Example No. 5
def _get_url(url, branch, **kwargs):
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        response = http.get(url, **kwargs)
        data = json2value(response.content.decode("utf8"))
        if data.error.startswith("unknown revision"):
            Log.error(UNKNOWN_PUSH,
                      revision=strings.between(data.error, "'", "'"))
        if is_text(data) and data.startswith("unknown revision"):
            Log.error(UNKNOWN_PUSH, revision=strings.between(data, "'", "'"))
        # branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
Example No. 6
    def _get_push(self, branch, changeset_id):
        if self.es.cluster.version.startswith("1.7."):
            query = {
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"term": {"branch.name": branch.name}},
                        {"prefix": {"changeset.id": changeset_id[0:12]}}
                    ]}
                }},
                "size": 1
            }
        else:
            query = {
                "query": {"bool": {"must": [
                    {"term": {"branch.name": branch.name}},
                    {"prefix": {"changeset.id": changeset_id[0:12]}}
                ]}},
                "size": 1
            }

        try:
            # ALWAYS TRY ES FIRST
            with self.es_locker:
                response = self.es.search(query)
                json_push = response.hits.hits[0]._source.push
            if json_push:
                return json_push
        except Exception:
            pass

        url = branch.url.rstrip("/") + "/json-pushes?full=1&changeset=" + changeset_id
        with Explanation("Pulling pushlog from {{url}}", url=url, debug=DEBUG):
            Log.note(
                "Reading pushlog from {{url}}",
                url=url,
                changeset=changeset_id
            )
            data = self._get_and_retry(url, branch)
            # QUEUE UP THE OTHER CHANGESETS IN THE PUSH
            self.todo.add((branch, [c.node for cs in data.values().changesets for c in cs]))
            pushes = [
                Push(id=int(index), date=_push.date, user=_push.user)
                for index, _push in data.items()
            ]

        if len(pushes) == 0:
            return Null
        elif len(pushes) == 1:
            return pushes[0]
        else:
            Log.error("do not know what to do")
Example No. 7
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.settings.exclude = set(self.settings.exclude)
        self.settings.show_foreign_keys = coalesce(
            self.settings.show_foreign_keys, True)

        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            with self.db:
                with self.db.transaction():
                    self._scan_database()
Example No. 8
    def __init__(
            self,
            hg=None,  # CONNECT TO hg
            repo=None,  # CONNECTION INFO FOR ES CACHE
            branches=None,  # CONNECTION INFO FOR ES CACHE
            use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
            timeout=30 * SECOND,
            kwargs=None):
        if not _hg_branches:
            _late_imports()

        self.es_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)

        self.settings = kwargs
        self.timeout = Duration(timeout)

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            response = http.head(self.settings.hg.url)

        if branches == None:
            self.branches = _hg_branches.get_branches(kwargs=kwargs)
            self.es = None
            return

        self.last_cache_miss = Date.now()

        set_default(repo, {"schema": revision_schema})
        self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(
            kwargs=repo)

        def setup_es(please_stop):
            with suppress_exception:
                self.es.add_alias()

            with suppress_exception:
                self.es.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.timeout = timeout
        Thread.run("hg daemon", self._daemon)
Example No. 9
    def get_revision(self, revision, locale=None, get_diff=False):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if DEBUG:
                Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id)
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))
            if output.push.date:
                return output

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text_type, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
            if not b:
                Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)

        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch, found_revision.changeset.id)

        url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e
            output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff)
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            return output
Example No. 10
    def __init__(
        self,
        hg=None,  # CONNECT TO hg
        repo=None,  # CONNECTION INFO FOR ES CACHE
        use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
        timeout=30 * SECOND,
        kwargs=None,
    ):
        if not _hg_branches:
            _late_imports()

        if not is_text(repo.index):
            Log.error("Expecting 'index' parameter")
        self.repo_locker = Lock()
        self.moves_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)
        self.settings = kwargs
        self.timeout = Duration(timeout)
        self.last_cache_miss = Date.now()

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            http.head(self.settings.hg.url)

        set_default(repo, {
            "type": "revision",
            "schema": revision_schema,
        })
        kwargs.branches = set_default(
            {
                "index": repo.index + "-branches",
                "type": "branch",
            },
            repo,
        )
        moves = set_default(
            {
                "index": repo.index + "-moves",
            },
            repo,
        )

        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        cluster = elasticsearch.Cluster(kwargs=repo)
        self.repo = cluster.get_or_create_index(kwargs=repo)
        self.moves = cluster.get_or_create_index(kwargs=moves)

        def setup_es(please_stop):
            with suppress_exception:
                self.repo.add_alias()
            with suppress_exception:
                self.moves.add_alias()

            with suppress_exception:
                self.repo.set_refresh_interval(seconds=1)
            with suppress_exception:
                self.moves.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        Thread.run("hg daemon", self._daemon)
Example No. 11
    def get_revision(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True,
                     after=None):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision,
                                              locale=locale,
                                              get_diff=get_diff,
                                              get_moves=get_moves,
                                              after=after)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note(
                "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES",
                branch=output.branch.name,
                locale=locale,
                revision=output.changeset.id,
            )
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents), None))
                self.todo.add((output.branch, listwrap(output.children), None))
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note(
                "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
                seconds=next_cache_miss - self.last_cache_miss,
            )
            Till(till=next_cache_miss.unix).wait()

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning(
                    "can not find branch ({{branch}}, {{locale}})",
                    branch=lower_name,
                    locale=locale,
                )
                return Null

        if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)
        id12 = found_revision.changeset.id[0:12]

        url1 = found_revision.branch.url.rstrip(
            "/") + "/json-info?node=" + id12
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + id12
        url3 = (found_revision.branch.url.rstrip("/") +
                "/json-automationrelevance/" + id12)
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            automation_details = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
                automation_details = self._get_raw_json_rev(
                    url3, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e

            raw_rev3_changeset = first(r for r in automation_details.changesets
                                       if r.node[:12] == id12)
            if last(automation_details.changesets) != raw_rev3_changeset:
                Log.note("interesting")

            output = self._normalize_revision(
                set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
                found_revision,
                push,
                get_diff,
                get_moves,
            )
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents), None))
                self.todo.add((output.branch, listwrap(output.children), None))
                self.todo.add((output.branch, listwrap(output.backsoutnodes),
                               output.push.date))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
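
The cache-miss rate limiter near the top schedules the next allowed hg call at `last_cache_miss + Random.float(WAIT_AFTER_CACHE_MISS * 2)` seconds and, if that moment is still in the future, waits until it arrives, so cache misses average out to roughly one hg call per `WAIT_AFTER_CACHE_MISS`. The same idea with the standard library (a sketch; the constant is a placeholder):

import random
import time

WAIT_AFTER_CACHE_MISS = 30  # seconds; placeholder for illustration
_last_cache_miss = time.monotonic()

def rate_limit_cache_miss():
    # Jittered delay: on average one hg call per WAIT_AFTER_CACHE_MISS seconds.
    global _last_cache_miss
    next_allowed = _last_cache_miss + random.uniform(0, WAIT_AFTER_CACHE_MISS * 2)
    _last_cache_miss = time.monotonic()
    if next_allowed > _last_cache_miss:
        time.sleep(next_allowed - _last_cache_miss)
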
Example No. 12
    def _get_from_hg(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True):
        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note(
                "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
                seconds=next_cache_miss - self.last_cache_miss,
            )
            Till(till=next_cache_miss.unix).wait()

        # CLEAN UP BRANCH NAME
        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning(
                    "can not find branch ({{branch}}, {{locale}})",
                    branch=lower_name,
                    locale=locale,
                )
                return Null

        # REFRESH BRANCHES, IF TOO OLD
        if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        # FIND THE PUSH
        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)
        id12 = found_revision.changeset.id[0:12]
        base_url = URL(found_revision.branch.url)

        with Explanation("get revision from {{url}}",
                         url=base_url,
                         debug=DEBUG):
            raw_rev2 = Null
            automation_details = Null
            try:
                raw_rev1 = self._get_raw_json_info((base_url / "json-info") +
                                                   {"node": id12})
                raw_rev2 = self._get_raw_json_rev(base_url / "json-rev" / id12)
                automation_details = self._get_raw_json_rev(
                    base_url / "json-automationrelevance" / id12)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e

            raw_rev3_changeset = first(r for r in automation_details.changesets
                                       if r.node[:12] == id12)
            if last(automation_details.changesets) != raw_rev3_changeset:
                Log.note("interesting")

            output = self._normalize_revision(
                set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
                found_revision,
                push,
                get_diff,
                get_moves,
            )
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.extend([
                    (output.branch, listwrap(output.parents), None),
                    (output.branch, listwrap(output.children), None),
                    (
                        output.branch,
                        listwrap(output.backsoutnodes),
                        output.push.date,
                    ),
                ])

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None

        return output
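
Unlike the earlier string-concatenation versions, this variant builds its endpoints with a `URL` object where `/` appends a path segment and `+ {"node": id12}` appends query parameters (as used in the snippet). With only the standard library the three endpoints come out roughly like this (illustrative; the real `URL` class handles escaping and trailing slashes itself):

from urllib.parse import urlencode

def hg_endpoints(branch_url, changeset_id):
    base = branch_url.rstrip("/")
    id12 = changeset_id[0:12]
    return (
        base + "/json-info?" + urlencode({"node": id12}),
        base + "/json-rev/" + id12,
        base + "/json-automationrelevance/" + id12,
    )

urls = hg_endpoints("https://hg.mozilla.org/mozilla-central/", "deadbeefcafe12345678")
assert urls[1].endswith("/json-rev/deadbeefcafe")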