def get_pending(es, since):
    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {
            "range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    })

    if len(result.facets.default.terms) >= 200000:
        Log.error("Can not handle more than 200K bugs changed")

    pending_bugs = Multiset(
        result.facets.default.terms,
        key_field="term",
        count_field="count"
    )
    Log.note("Source has {{num}} bug versions for updating", {
        "num": len(pending_bugs)
    })
    return pending_bugs
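
For orientation, here is what the facet terms above look like and how the
Multiset is built from them. This is an illustrative sketch only: the data is
made up, and collections.Counter stands in for the pyLibrary Multiset.

# Illustrative stand-in for result.facets.default.terms
terms = [
    {"term": 12345, "count": 7},   # bug 12345 has 7 changed versions
    {"term": 67890, "count": 2},
]
# Same aggregation as Multiset(terms, key_field="term", count_field="count")
from collections import Counter
pending = Counter()
for t in terms:
    pending[t["term"]] += t["count"]
assert sum(pending.values()) == 9  # len(pending_bugs) reports bug versions
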
    def random_sample_of_bugs(self):
        """
        I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
        WHEN I HAVE A REFERENCE TO COMPARE TO
        """
        NUM_TO_TEST = 100
        MAX_BUG_ID = 900000

        with DB(self.settings.bugzilla) as db:
            candidate = elasticsearch.make_test_instance(
                "candidate", self.settings.candidate)
            reference = ElasticSearch(self.settings.private_bugs_reference)

            #GO FASTER BY STORING LOCAL FILE
            local_cache = File(self.settings.param.temp_dir +
                               "/private_bugs.json")
            if local_cache.exists:
                private_bugs = set(CNV.JSON2object(local_cache.read()))
            else:
                with Timer("get private bugs"):
                    private_bugs = compare_es.get_private_bugs(reference)
                    local_cache.write(CNV.object2JSON(private_bugs))

            while True:
                some_bugs = [
                    b for b in
                    [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                    if b not in private_bugs
                ]

                Log.note("Test with the following bug_ids: {{bugs}}",
                         {"bugs": some_bugs})

                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                param.start_time = 0
                param.start_time_str = extract_bugzilla.milli2string(db, 0)
                param.alias_file = self.settings.param.alias_file

                try:
                    with ThreadedQueue(candidate, 100) as output:
                        etl(db, output, param, please_stop=None)

                    #COMPARE ALL BUGS
                    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                    found_errors = compare_both(candidate, reference,
                                                self.settings, some_bugs)
                    if found_errors:
                        Log.note("Errors found")
                        break
                    else:
                        pass
                except Exception, e:
                    Log.warning(
                        "Total failure during compare of bugs {{bugs}}",
                        {"bugs": some_bugs}, e)
Example #3
def incremental_etl(settings, param, db, es, es_comments, output_queue):
    ####################################################################
    ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE
    ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS
    ####################################################################

    #REMOVE PRIVATE BUGS
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note(
        "Ensure the following private bugs are deleted:\n{{private_bugs|indent}}",
        {"private_bugs": private_bugs})
    for g, delete_bugs in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
        if still_existing:
            Log.note(
                "Ensure the following private bugs are deleted:\n{{private_bugs|indent}}",
                {"private_bugs": still_existing})
        es.delete_record({"terms": {"bug_id": delete_bugs}})
        es_comments.delete_record({"terms": {"bug_id": delete_bugs}})

    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record({"terms": {"bug_id": possible_public_bugs}})
    else:
        #PUBLIC BUGS
        #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
        #    IF REMOVING GROUP THEN NO RECORDS TO DELETE
        pass

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record({"terms": {"bug_id": bugs_to_refresh}})

    #REBUILD BUGS THAT GOT REMOVED
    bug_list = (possible_public_bugs
                | bugs_to_refresh) - private_bugs  # REMOVE PRIVATE BUGS
    if bug_list:
        refresh_param = param.copy()
        refresh_param.bug_list = bug_list
        refresh_param.start_time = 0
        refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

        try:
            etl(db, output_queue, refresh_param.copy(), please_stop=None)
            etl_comments(db,
                         es_comments,
                         refresh_param.copy(),
                         please_stop=None)
        except Exception, e:
            Log.error("Problem with etl using parameters {{parameters}}",
                      {"parameters": refresh_param}, e)
    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        records = {v["id"]: v["value"] for v in records}

        struct.unwrap(self.data).update(records)

        data_as_json = CNV.object2JSON(self.data, pretty=True)

        File(self.filename).write(data_as_json)
        Log.note("{{num}} items added", {"num": len(records)})
Example #5
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        return ElasticSearch(settings)
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {"filename":nvl(settings.param.alias_file.path, settings.param.alias_file)}):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            Log.warning("No alias file found (looking at {{filename}}", {"filename":nvl(settings.param.alias_file.path, settings.param.alias_file)})
            alias_json = "{}"
            #self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
def analysis(settings, last_run, please_stop):
    DIFF = 7
    if last_run:
        DIFF = 4      #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    try_again = True

    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
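
The heuristic above: addresses with net-negative counts are "problems", and
the best "solution" is the address with the largest positive count across the
same bugs. A toy version of that selection with the standard library, on
illustrative data only:

from collections import Counter

# One bug saw "old@x" removed (negative) and "new@x" added (positive)
bug_agg = Counter({"old@x": -3, "new@x": 3, "noise@x": 1})
problems = [e for e, c in bug_agg.items() if c < 0]
solutions = sorted(
    ((e, c) for e, c in bug_agg.items() if c > 0),
    key=lambda ec: -ec[1],
)
assert problems == ["old@x"]
assert solutions[0][0] == "new@x"  # the most-added address wins
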
Example #8
def etl_comments(db, es, param, please_stop):
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db = DB(db)
            comment_db_cache.append(comment_db)

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for g, c in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
            es.extend({"id": cc.comment_id, "value": cc} for cc in c)
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })

        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
Example #14
def add_alias(lost, found):
    found_record = aliases.get(found, None)
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    delete_list = []

    #FOLD bugs ON lost=found
    for bug_id, agg in bugs.iteritems():
        v = agg.dic.get(lost, 0)
        if v != 0:
            agg.add(lost, -v)
            agg.add(found, v)

        if not agg:
            delete_list.append(bug_id)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        for bug_id, agg in bugs.iteritems():
            v = agg.dic.get(old_canonical, 0)
            if v != 0:
                agg.add(old_canonical, -v)
                agg.add(new_canonical, v)

            if not agg:
                delete_list.append(bug_id)

    for d in delete_list:
        del bugs[d]

    #FOLD ALIASES
    for k, v in aliases.iteritems():
        if v.canonical == old_canonical:
            Log.note("ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                "alias": k,
                "old": old_canonical,
                "new": found
            })
            v.canonical = found
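
The fold moves every occurrence of the lost address onto the found one; the
same move expressed with collections.Counter, on made-up data. Note how a bug
whose churn is fully explained ends up empty, which is exactly what the
delete_list cleanup above removes.

from collections import Counter

agg = Counter({"old@x": -2, "new@x": 2, "other@x": 1})
lost, found = "old@x", "new@x"
v = agg.get(lost, 0)
if v != 0:
    agg[lost] -= v    # agg.add(lost, -v)
    agg[found] += v   # agg.add(found, v)
agg = Counter({k: c for k, c in agg.items() if c != 0})  # prune zero entries
assert agg == Counter({"other@x": 1})
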
Example #15
    def test_private_bugs_do_not_show(self):
        self.settings.param.allow_private_bugs = False
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()

        private_bugs = set(Random.sample(self.settings.param.bugs, 3))
        Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

        database.make_test_instance(self.settings.bugzilla)

        #MARK SOME BUGS PRIVATE
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
        es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
        bz_etl.main(self.settings, es, es_c)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        verify_no_private_bugs(es, private_bugs)
Example #16
def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [{
        "lost": n,
        "found": d.canonical
    } for n, d in aliases.items() if d.canonical != None and n != d.canonical]

    sorted = Q.sort(data, "found")
    for s in sorted:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))
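
struct.inverse turns the lost -> canonical map into canonical -> [lost, ...];
a plain-dict sketch of that inversion, with made-up addresses:

clean = {"a@x": "c@x", "b@x": "c@x", "d@x": "e@x"}
rev_clean = {}
for lost, canonical in clean.items():
    rev_clean.setdefault(canonical, []).append(lost)
assert sorted(rev_clean["c@x"]) == ["a@x", "b@x"]
assert rev_clean["e@x"] == ["d@x"]
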
Example #18
def make_test_instance(db_settings):
    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            #CLEAR SCHEMA
            Log.note("Make empty {{schema}} schema", {"schema":db_settings.schema})
            no_schema=db_settings.copy()
            no_schema.schema = None
            with DB(no_schema) as db:
                db.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema":db.quote_column(db_settings.schema)})
                db.execute("CREATE DATABASE {{schema}}", {"schema":db.quote_column(db_settings.schema)})

            #FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema":db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            #ADD MISSING TABLES
            with DB(db_settings) as db:
                db.execute("""
                CREATE TABLE `longdescs_tags` (
                  `id` mediumint(9) NOT NULL AUTO_INCREMENT,
                  `comment_id` int(11) DEFAULT NULL,
                  `tag` varchar(24) NOT NULL,
                  PRIMARY KEY (`id`),
                  UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`),
                  CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id` FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`) ON DELETE CASCADE ON UPDATE CASCADE
                ) DEFAULT CHARSET=utf8""")
        except Exception, e:
            Log.error("Can not setup test database", e)
Example #19
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) ARE EMAIL ADDRESSES THAT BELONG TO PEOPLE.
    SINCE THE EMAIL ADDRESS FOR A PERSON CAN CHANGE OVER TIME, THIS CODE
    WILL ASSOCIATE EACH PERSON WITH THE EMAIL ADDRESSES USED
    OVER THE LIFETIME OF THE BUGZILLA DATA.  'PERSON' IS ABSTRACT, AND SIMPLY
    ASSIGNED A CANONICAL EMAIL ADDRESS TO FACILITATE IDENTIFICATION
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    if bug_list:
        with DB(settings.bugzilla, readonly=True) as db:
            data = get_all_cc_changes(db, bug_list)
            aggregator(data)
            analysis(settings, True, please_stop)
        return

    with DB(settings.bugzilla, readonly=True) as db:
        start = nvl(settings.param.start, 0)
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for s, e in Q.intervals(start, end, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {
                "start": s,
                "end": e
            })
            data = get_all_cc_changes(db, range(s, e))
            if please_stop:
                break
            aggregator(data)

            analysis(settings, e >= end, please_stop)
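
Q.intervals(start, end, increment) pages through the bug_id space in fixed
blocks; a sketch assuming half-open [s, e) blocks, which matches the
"Load range {{start}}-{{end}}" loop above:

def intervals(start, end, size):
    # Yield (s, e) half-open blocks covering [start, end)
    s = start
    while s < end:
        e = min(s + size, end)
        yield s, e
        s = e

assert list(intervals(0, 25, 10)) == [(0, 10), (10, 20), (20, 25)]
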
def main():
    """
    MEANT TO BE RUN JUST ONCE IN DEVELOPMENT TO CONVERT A BIG PUBLIC
    DATABASE (8G+) INTO A TINY TESTING DB (FOR ADDING TO REPOSITORY)
    """
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        input = raw_input(
            "We are going to totally wipe out the " +
            settings.bugzilla.schema.upper() + " schema at " +
            settings.bugzilla.host.upper() + "!  Type \"YES\" to continue: ")
        if input != "YES":
            Log.note("Aborted.  No Changes made.")
            return

        Log.note("Scrubbing db of those pesky records.")
        Log.note("This is going to take hours ...")

        DB.execute_file(settings.bugzilla, "./tests/resources/sql/scrub_db.sql", {
            "schema":settings.bugzilla.schema,
            "bug_list":SQL(settings.param.bugs)
        })
        Log.note("... Done!")
    finally:
        Log.stop()
Example #27
    def test_recent_private_stuff_does_not_show(self):
        self.settings.param.allow_private_bugs = False
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()

        database.make_test_instance(self.settings.bugzilla)

        es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
        es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
        bz_etl.main(self.settings, es, es_c)

        #MARK SOME STUFF PRIVATE
        with DB(self.settings.bugzilla) as db:
            #BUGS
            private_bugs = set(Random.sample(self.settings.param.bugs, 3))
            Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})
            for b in private_bugs:
                database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

            #COMMENTS
            comments = db.query("SELECT comment_id FROM longdescs").comment_id
            marked_private_comments = Random.sample(comments, 5)
            for c in marked_private_comments:
                database.mark_comment_private(db, c, isprivate=1)

            #INCLUDE COMMENTS OF THE PRIVATE BUGS
            implied_private_comments = db.query("""
                SELECT comment_id FROM longdescs WHERE {{where}}
            """, {
                "where": esfilter2sqlwhere(db, {"terms":{"bug_id":private_bugs}})
            }).comment_id
            private_comments = marked_private_comments + implied_private_comments
            Log.note("The private comments are {{comments}}", {"comments": private_comments})

            #ATTACHMENTS
            attachments = db.query("SELECT bug_id, attach_id FROM attachments")
            private_attachments = Random.sample(attachments, 5)
            Log.note("The private attachments are {{attachments}}", {"attachments": private_attachments})
            for a in private_attachments:
                database.mark_attachment_private(db, a.attach_id, isprivate=1)

        if not File(self.settings.param.last_run_time).exists:
            Log.error("last_run_time should exist")
        bz_etl.main(self.settings, es, es_c)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        verify_no_private_bugs(es, private_bugs)
        verify_no_private_attachments(es, private_attachments)
        verify_no_private_comments(es_c, private_comments)

        #MARK SOME STUFF PUBLIC

        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                database.remove_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        bz_etl.main(self.settings, es, es_c)

        #VERIFY BUG IS PUBLIC, BUT PRIVATE ATTACHMENTS AND COMMENTS STILL NOT
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        verify_public_bugs(es, private_bugs)
        verify_no_private_attachments(es, private_attachments)
        verify_no_private_comments(es_c, marked_private_comments)
Example #28
    def test_changes_to_private_bugs_still_have_bug_group(self):
        self.settings.param.allow_private_bugs = True
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()

        private_bugs = set(Random.sample(self.settings.param.bugs, 3))

        Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

        database.make_test_instance(self.settings.bugzilla)

        #MARK SOME BUGS PRIVATE
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
        es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
        bz_etl.main(self.settings, es, es_c)

        # MAKE A CHANGE TO THE PRIVATE BUGS
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                old_bug = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": b})[0]
                new_bug = old_bug.copy()

                new_bug.bug_status = "NEW STATUS"
                diff(db, "bugs", old_bug, new_bug)


        #RUN INCREMENTAL
        bz_etl.main(self.settings, es, es_c)

        #VERIFY BUG GROUP STILL EXISTS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        now = datetime.utcnow()
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": private_bugs}},
                    {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })
        latest_bugs = Q.select(results.hits.hits, "_source")
        latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

        for bug_id in private_bugs:
            if latest_bugs_index[bug_id] == None:
                Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

            bug_group = latest_bugs_index[bug_id].bug_group
            if not bug_group:
                Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
            if BUG_GROUP_FOR_TESTING not in bug_group:
                Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                    "bug_id": bug_id,
                    "bug_group": BUG_GROUP_FOR_TESTING
                })
Example #29
    def test_private_bugs_not_leaking(self):
        bad_news = False

        # FOR ALL BUG BLOCKS
        for min_id, max_id in self.blocks_of_bugs():
            results = get(
                self.private,
                {"and": [
                    {"match_all": {}},
                    {"and": [
                        {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                        {"exists": {"field": "bug_group"}},
                        {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                        {"range": {"modified_ts": {"lt": A_WHILE_AGO}}}, #OF A MINIMUM AGE
                    ]}
                ]},
                ["bug_id", "bug_group", "modified_ts"]
            )

            private_ids = {b.bug_id: b.bug_group for b in results}

            Log.note("Ensure {{num}} bugs did not leak", {
                "num": len(private_ids.keys())
            })

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"terms": {"bug_id": private_ids.keys()}},
                    {"range": {"expires_on": {"gte": NOW}}} # SOME BUGS WILL LEAK FOR A LITTLE WHILE
                ]}
            )

            if leaked_bugs:
                bad_news = True
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms":{"bug_id":leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} leaks!! {{bugs}}", {
                    "num": len(leaked_bugs),
                    "bugs": Q.run({
                        "from":leaked_bugs,
                        "select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}],
                        "sort":"bug_id"
                    })
                })
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "bug_group": private_ids[b.bug_id],
                        "version": milli2datetime(b)
                    })

            #CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
            leaked_comments = get(
                self.public_comments,
                {"terms": {"bug_id": private_ids.keys()}},
                limit=20
            )
            if leaked_comments:
                bad_news = True

                if self.settings.param.delete:
                    self.public_comments.delete_record(
                        {"terms":{"bug_id":leaked_comments.bug_id}}
                    )

                Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
                    "num": len(leaked_comments),
                    "comments": leaked_comments
                })

        if bad_news:
            Log.error("Bugs have leaked!")
Example #30
    added = set(compressed.keys()) - set(old_aliases.keys())
    removed = set(old_aliases.keys()) - set(compressed.keys())
    common = set(compressed.keys()) & set(old_aliases.keys())

    changed = set()
    for c in common:
        if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON(old_aliases[c], pretty=True):
            changed.add(c)

    if added or removed or changed:
        alias_json = CNV.object2JSON(compressed, pretty=True)
        file = File(settings.param.alias_file)
        file.write(alias_json)

        Log.note("{{num}} of {{total}} aliases saved", {
            "num": len(compressed.keys()),
            "total": len(aliases.keys())
        })


def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
    finally:
        Log.stop()


if __name__ == "__main__":
Example #31
    def test_private_attachments_not_leaking(self):
        for min_id, max_id in self.blocks_of_bugs():
            # FIND ALL PRIVATE ATTACHMENTS
            bugs_w_private_attachments = get(
                self.private,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}}, #OF A MINIMUM AGE
                    {"nested": { #HAS ATTACHMENT.
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"exists": {"field":"attachments.attach_id"}}
                        }}
                    }},
                    {"or":[
                        {"nested": { #PRIVATE ATTACHMENT, OR...
                            "path": "attachments",
                            "query": {"filtered": {
                                "query": {"match_all": {}},
                                "filter": {"term": {"attachments.isprivate": 1}}
                            }}
                        }},
                        {"exists":{"field":"bug_group"}}  # ...PRIVATE BUG
                    ]}
                ]},
                fields=["bug_id", "bug_group", "attachments", "modified_ts"]
            )

            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {"or": [
                    {"exists": "bug_group"},
                    {"terms": {"attachments.isprivate": ['1', True, 1]}}
                ]}
            })
            try:
                private_attachments = [int(v) for v in private_attachments]
            except Exception, e:
                private_attachments = Q.run({
                    "from": bugs_w_private_attachments,
                    "select": "attachments.attach_id",
                    "where": {"or": [
                        {"exists": "bug_group"},
                        {"terms": {"attachments.isprivate": ['1', True, 1]}}
                    ]}
                })

            Log.note("Ensure {{num}} attachments did not leak", {
                "num": len(private_attachments)
            })

            #VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}}, # CURRENT BUGS
                    {"nested": {
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"terms": {"attach_id": private_attachments}}
                        }}
                    }}
                ]}
                # fields=["bug_id", "attachments"]
            )


            if leaked_bugs:
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms":{"bug_id":leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "version": b
                    })
                Log.error("Attachments have leaked!")
Example #35
                    bugs b
                LEFT JOIN
                    bug_group_map m ON m.bug_id=b.bug_id
                WHERE
                    delta_ts >= {{start_time_str}} AND
                    m.bug_id IS NULL
            """, {
                "start_time_str": param.start_time_str
            }), u"bug_id")

    if not bug_list:
        return

    with Thread.run("alias analysis", alias_analysis.main, settings=settings, bug_list=bug_list):
        Log.note("Updating {{num}} bugs:\n{{bug_list|indent}}", {
            "num": len(bug_list),
            "bug_list": bug_list
        })
        param.bug_list = bug_list
        run_both_etl(**{
            "db": db,
            "output_queue": output_queue,
            "es_comments": es_comments,
            "param": param.copy()
        })


def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
    with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
        start = nvl(settings.param.start, 0)
        if resume_from_last_run:
    def test_changes_to_private_bugs_still_have_bug_group(self):
        self.settings.param.allow_private_bugs = True
        File(self.settings.param.first_run_time).delete()
        File(self.settings.param.last_run_time).delete()

        private_bugs = set(Random.sample(self.settings.param.bugs, 3))

        Log.note("The private bugs for this test are {{bugs}}",
                 {"bugs": private_bugs})

        database.make_test_instance(self.settings.bugzilla)

        #MARK SOME BUGS PRIVATE
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        es = elasticsearch.make_test_instance("candidate",
                                              self.settings.real.bugs)
        es_c = elasticsearch.make_test_instance("candidate_comments",
                                                self.settings.real.comments)
        bz_etl.main(self.settings, es, es_c)

        # MAKE A CHANGE TO THE PRIVATE BUGS
        with DB(self.settings.bugzilla) as db:
            for b in private_bugs:
                old_bug = db.query(
                    "SELECT * FROM bugs WHERE bug_id={{bug_id}}",
                    {"bug_id": b})[0]
                new_bug = old_bug.copy()

                new_bug.bug_status = "NEW STATUS"
                diff(db, "bugs", old_bug, new_bug)

        #RUN INCREMENTAL
        bz_etl.main(self.settings, es, es_c)

        #VERIFY BUG GROUP STILL EXISTS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        now = datetime.utcnow()
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": private_bugs}},
                    {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })
        latest_bugs = Q.select(results.hits.hits, "_source")
        latest_bugs_index = Q.unique_index(
            latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

        for bug_id in private_bugs:
            if latest_bugs_index[bug_id] == None:
                Log.error("Expecting to find the private bug {{bug_id}}",
                          {"bug_id": bug_id})

            bug_group = latest_bugs_index[bug_id].bug_group
            if not bug_group:
                Log.error(
                    "Expecting private bug ({{bug_id}}) to have a bug group",
                    {"bug_id": bug_id})
            if BUG_GROUP_FOR_TESTING not in bug_group:
                Log.error(
                    "Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group",
                    {
                        "bug_id": bug_id,
                        "bug_group": BUG_GROUP_FOR_TESTING
                    })
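
    #HYPOTHETICAL HELPER (not in the original suite): the "current records"
    #query used above, factored out for reuse; a sketch assuming the same
    #es 0.x "filtered" syntax as the rest of these tests
    def current_bugs_query(self, bug_ids, now_millis):
        return {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": list(bug_ids)}},
                    {"range": {"expires_on": {"gte": now_millis}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        }
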
    def test_private_attachments_not_leaking(self):
        for min_id, max_id in self.blocks_of_bugs():
            # FIND ALL PRIVATE ATTACHMENTS
            bugs_w_private_attachments = get(
                self.private,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  #OF A MINIMUM AGE
                    {"nested": {  #HAS ATTACHMENT
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"exists": {"field": "attachments.attach_id"}}
                        }}
                    }},
                    {"or": [
                        {"nested": {  #PRIVATE ATTACHMENT, OR...
                            "path": "attachments",
                            "query": {"filtered": {
                                "query": {"match_all": {}},
                                "filter": {"term": {"attachments.isprivate": 1}}
                            }}
                        }},
                        {"exists": {"field": "bug_group"}}  #...PRIVATE BUG
                    ]}
                ]},
                fields=["bug_id", "bug_group", "attachments", "modified_ts"])

            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {"or": [
                    {"exists": "bug_group"},
                    {"terms": {"attachments.isprivate": ['1', True, 1]}}
                ]}
            })
            try:
                private_attachments = [int(v) for v in private_attachments]
            except Exception, e:
                #RE-RUN THE SAME QUERY, APPARENTLY LEFT IN AS A DEBUGGING AID
                #SO THE NON-INTEGER attach_id VALUES CAN BE INSPECTED
                private_attachments = Q.run({
                    "from": bugs_w_private_attachments,
                    "select": "attachments.attach_id",
                    "where": {"or": [
                        {"exists": "bug_group"},
                        {"terms": {"attachments.isprivate": ['1', True, 1]}}
                    ]}
                })

            Log.note("Ensure {{num}} attachments did not leak",
                     {"num": len(private_attachments)})

            #VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},  #CURRENT BUGS
                    {"nested": {
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"terms": {"attach_id": private_attachments}}
                        }}
                    }}
                ]}
                # fields=["bug_id", "attachments"]
            )

            if leaked_bugs:
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {
                            "bug_id": leaked_bugs.bug_id
                        }})

                Log.note("{{num}} bugs with private attachments have leaked!",
                         {"num": len(leaked_bugs)})
                for b in leaked_bugs:
                    Log.note(
                        "{{bug_id}} has private_attachment\n{{version|indent}}",
                        {
                            "bug_id": b.bug_id,
                            "version": b
                        })
                Log.error("Attachments have leaked!")