def analysis(settings, last_run, please_stop):
    """
    Repeatedly pair "problem" emails with their most likely replacement.

    Each pass totals the negative counts per aliased email over every entry
    of `bugs`.  Then, most negative first, it sums the multisets of the bugs
    showing that problem and registers the top-scoring email as the alias,
    provided it beats the runner-up by more than DIFF.  Passes repeat until
    one adds no alias or `please_stop` becomes truthy; the aliases are then
    saved with saveAliases().

    :param settings: handed to saveAliases()
    :param last_run: truthy to use the smaller margin, look at every
        non-ignored negative email, and accept an exact two-way match
    :param please_stop: stop signal, checked between passes and problems
    """
    # ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    DIFF = 4 if last_run else 7
    threshold = -(DIFF / 2)

    found_alias = True
    while found_alias and not please_stop:
        # FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for _bug_id, agg in bugs.iteritems():
            # ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        worth_fixing = []
        for e, c in problem_agg.dic.iteritems():
            if aliases.get(e, Null).ignore:
                continue
            if c <= threshold or last_run:
                worth_fixing.append({"email": e, "count": c})
        problems = Q.sort(worth_fixing, ["count", "email"])

        found_alias = False
        for problem in problems:
            if please_stop:
                break

            # FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for _bug_id, agg in bugs.iteritems():
                # ONLY BUGS THAT ARE EXPERIENCING THIS problem
                if agg.dic.get(problem.email, 0) < 0:
                    solution_agg += agg
            scored = [
                {"email": e, "count": c}
                for e, c in solution_agg.dic.iteritems()
            ]
            solutions = Q.sort(scored, [{"field": "count", "sort": -1}, "email"])

            # an exact match is only trusted on the last run
            exact_match = (
                last_run
                and len(solutions) == 2
                and solutions[0].count == -solutions[1].count
            )
            if not exact_match:
                if len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                    # not distinctive enough
                    continue

            best_solution = solutions[0]
            top_counts = Q.select(solutions, "count")[:10]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(top_counts)
            })
            found_alias = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
示例#2
0
def main(settings):
    """
    Load the alias file and log the alias -> canonical mappings it holds.

    Logs one "found == lost" line per alias whose canonical name is set and
    differs from the alias (ordered by canonical name), then the inverse
    mapping as pretty JSON, then again each canonical name that has more
    than three aliases.

    :param settings: `settings.param.alias_file` names the JSON file to read
    """
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())

    for detail in aliases.values():
        detail.candidates = CNV.dict2Multiset(detail.candidates)

    def renamed(name, detail):
        # `!= None` (not `is not None`) is kept on purpose
        return detail.canonical != None and name != detail.canonical

    pairs = []
    for name, detail in aliases.items():
        if renamed(name, detail):
            pairs.append({"lost": name, "found": detail.canonical})

    for pair in Q.sort(pairs, "found"):
        Log.note("{{found}} == {{lost}}", pair)

    # same selection as above, minus the empty alias
    clean = {}
    for name, detail in aliases.items():
        if renamed(name, detail) and name != "":
            clean[name] = detail.canonical

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for canonical, names in rev_clean.items():
        if len(names) > 3:
            Log.note(CNV.object2JSON({canonical: names}, pretty=True))
示例#3
0
def main(settings):
    """
    Load the alias file and log the alias -> canonical mappings it holds.

    Logs one "found == lost" line per alias whose canonical name is set and
    differs from the alias (ordered by canonical name), then the inverse
    mapping as pretty JSON, then again each canonical name that has more
    than three aliases.

    :param settings: `settings.param.alias_file` names the JSON file to read
    """
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())

    for detail in aliases.values():
        detail.candidates = CNV.dict2Multiset(detail.candidates)

    def renamed(name, detail):
        # `!= None` (not `is not None`) is kept on purpose
        return detail.canonical != None and name != detail.canonical

    pairs = []
    for name, detail in aliases.items():
        if renamed(name, detail):
            pairs.append({"lost": name, "found": detail.canonical})

    for pair in Q.sort(pairs, "found"):
        Log.note("{{found}} == {{lost}}", pair)

    # same selection as above, minus the empty alias
    clean = {}
    for name, detail in aliases.items():
        if renamed(name, detail) and name != "":
            clean[name] = detail.canonical

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for canonical, names in rev_clean.items():
        if len(names) > 3:
            Log.note(CNV.object2JSON({canonical: names}, pretty=True))
def compare_both(candidate, reference, settings, some_bugs):
    """
    Compare the bug versions in `candidate` against those in `reference`.

    Any previous output under `settings.param.errors` is deleted first.  For
    every bug whose pretty-printed JSON differs between the two sides, the
    candidate text is written to <errors>/try/<bug_id>.txt and the reference
    text to <errors>/ref/<bug_id>.txt.  A failure while processing one bug is
    logged as a warning and counted as a difference; the remaining bugs are
    still compared.  If anything differed, Log.error is called at the end.

    :param candidate: source of the versions under test
    :param reference: source of the versions considered correct
    :param settings: supplies `param.errors` and `bugzilla.expires_on`
    :param some_bugs: iterable of bug ids to compare
    """
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts"
                )
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                # ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                ref_versions = Q.sort(
                    [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                    "modified_ts"
                )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception as e:
                # one bad bug must not stop the comparison of the others
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

        if found_errors:
            Log.error("DIFFERENCES FOUND (Differences shown in {{path}})",
                      {"path": [try_dir, ref_dir]})
示例#5
0
def compare_both(candidate, reference, settings, some_bugs):
    """
    Compare the bug versions in `candidate` against those in `reference`.

    Any previous output under `settings.param.errors` is deleted first.  For
    every bug whose pretty-printed JSON differs between the two sides, the
    candidate text is written to <errors>/try/<bug_id>.txt and the reference
    text to <errors>/ref/<bug_id>.txt.  A failure while processing one bug is
    logged as a warning and counted as a difference; the remaining bugs are
    still compared.  If anything differed, Log.error is called at the end.

    :param candidate: source of the versions under test
    :param reference: source of the versions considered correct
    :param settings: supplies `param.errors` and `bugzilla.expires_on`
    :param some_bugs: iterable of bug ids to compare
    """
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts"
                )
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                # ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                ref_versions = Q.sort(
                    [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                    "modified_ts"
                )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception as e:
                # one bad bug must not stop the comparison of the others
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

        if found_errors:
            Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {
                "path": [try_dir, ref_dir]}
            )
示例#6
0
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION

    :param bug: one bug version record in the old format
    :param max_date: an expires_on greater than this is replaced with
        parse_bug_history.MAX_TIME

    NOTE(review): `bug` is rebound to a fresh object part-way through (the
    JSON round trip), yet no `return` is visible in this block, so as shown
    the caller receives None.  The block looks truncated - confirm against
    the full source before relying on it.
    """
    # everconfirmed: empty string means "not set", anything else is an integer
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    # Round trip through JSON to strip the trailing space from this one known
    # value wherever it occurs; from here on `bug` is a new object
    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    # zero votes are not stored at all
    if bug.votes == 0:
        del bug["votes"]
        # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
    #     bug.remaining_time = 0
    # a non-numeric cf_due_date is parsed as a "%Y-%m-%d" date and stored as milliseconds
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )
    # Sort the changes by field_name and rename the old key names to the new
    # ones by editing the serialized JSON
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )

    # an everconfirmed of zero is not stored at all
    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    # hard-coded correction for one specific bug version
    if bug.id == "692436_1336314345":
        bug.votes = 3

    # cf_last_resolved: numbers become long, anything else is parsed as a
    # "%Y-%m-%d %H:%M:%S" timestamp; any failure is deliberately ignored and
    # the field is left as it was
    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
示例#7
0
def etl(db, output_queue, param, please_stop):
    """
    PROCESS RANGE, AS SPECIFIED IN param AND PUSH
    BUG VERSION RECORDS TO output_queue

    One connection per function in `get_stuff_from_bugzilla` is opened on
    first use and kept in `db_cache`.  Every function is then run on its own
    thread with its cached connection and a copy of `param`; the combined
    rows are sorted and fed, in order, to a BugHistoryParser.

    :param db: used to open the cached connections the first time through
    :param output_queue: handed to the BugHistoryParser
    :param param: handed to each query function (as a copy) and to the parser
    :param please_stop: not consulted directly by this function
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with db_cache_lock:
        if not db_cache:
            with Timer("open connections to db"):
                connection = db
                for _unused in get_stuff_from_bugzilla:
                    # each connection is created from the previous one
                    connection = DB(connection)
                    db_cache.append(connection)

    db_results = Queue(max=2**30)

    def process(target, db, param, please_stop):
        # runs on a worker thread: collect everything the query returns
        db_results.extend(target(db, param))

    with db_cache_lock:
        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
        with AllThread() as workers:
            for index, query in enumerate(get_stuff_from_bugzilla):
                workers.add(process, query, db_cache[index], param.copy())
    db_results.add(Thread.STOP)

    sort_order = [
        "bug_id",
        "_merge_order",
        {"field": "modified_ts", "sort": -1},
        "modified_by"
    ]
    ordered_rows = Q.sort(db_results, sort_order)

    parser = BugHistoryParser(param, output_queue)
    for row in ordered_rows:
        parser.processRow(row)
    # finish with a row carrying the STOP_BUG id
    last_row = struct.wrap({
        "bug_id": parse_bug_history.STOP_BUG,
        "_merge_order": 1
    })
    parser.processRow(last_row)
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS

    Walks `r` recursively:
      * None and strings are returned unchanged
      * numbers above 800000000000 are taken to be epoch milliseconds and
        returned as "%Y-%m-%d %H:%M:%S" strings; smaller numbers are unchanged
      * dicts are rebuilt with lower-cased keys, dropping None values
      * other iterables become a list without None values (sorted when the
        items can be sorted), or None when nothing is left
      * anything else is returned unchanged

    :param r: the value to scrub
    :return: the scrubbed value; None if scrubbing failed (a warning is logged)
    """
    try:
        # `== None` rather than `is None`: presumably so that the library's
        # Null object is matched too - do not "tidy" this
        if r == None:
            return None
        if isinstance(r, basestring):
            return r
        if Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
            return r
        if isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        if hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                # unsortable items: keep the original order
                return output
        return r
    except Exception as e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)
        return None
示例#9
0
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS

    Walks `r` recursively:
      * None and strings are returned unchanged
      * numbers above 800000000000 are taken to be epoch milliseconds and
        returned as "%Y-%m-%d %H:%M:%S" strings; smaller numbers are unchanged
      * dicts are rebuilt with lower-cased keys, dropping None values
      * other iterables become a list without None values (sorted when the
        items can be sorted), or None when nothing is left
      * anything else is returned unchanged

    :param r: the value to scrub
    :return: the scrubbed value; None if scrubbing failed (a warning is logged)
    """
    try:
        # `== None` rather than `is None`: presumably so that the library's
        # Null object is matched too - do not "tidy" this
        if r == None:
            return None
        if isinstance(r, basestring):
            return r
        if Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
            return r
        if isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        if hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                # unsortable items: keep the original order
                return output
        return r
    except Exception as e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)
        return None
示例#10
0
def etl(db, output_queue, param, please_stop):
    """
    PROCESS RANGE, AS SPECIFIED IN param AND PUSH
    BUG VERSION RECORDS TO output_queue

    One connection per function in `get_stuff_from_bugzilla` is opened on
    first use and kept in `db_cache`.  Every function is then run on its own
    thread with its cached connection and a copy of `param`; the combined
    rows are sorted and fed, in order, to a BugHistoryParser.

    :param db: used to open the cached connections the first time through
    :param output_queue: handed to the BugHistoryParser
    :param param: handed to each query function (as a copy) and to the parser
    :param please_stop: not consulted directly by this function
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with db_cache_lock:
        if not db_cache:
            with Timer("open connections to db"):
                connection = db
                for _unused in get_stuff_from_bugzilla:
                    # each connection is created from the previous one
                    connection = DB(connection)
                    db_cache.append(connection)

    db_results = Queue(max=2**30)

    def process(target, db, param, please_stop):
        # runs on a worker thread: collect everything the query returns
        db_results.extend(target(db, param))

    with db_cache_lock:
        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
        with AllThread() as workers:
            for index, query in enumerate(get_stuff_from_bugzilla):
                workers.add(process, query, db_cache[index], param.copy())
    db_results.add(Thread.STOP)

    sort_order = [
        "bug_id",
        "_merge_order",
        {"field": "modified_ts", "sort": -1},
        "modified_by"
    ]
    ordered_rows = Q.sort(db_results, sort_order)

    parser = BugHistoryParser(param, output_queue)
    for row in ordered_rows:
        parser.processRow(row)
    # finish with a row carrying the STOP_BUG id
    last_row = struct.wrap({
        "bug_id": parse_bug_history.STOP_BUG,
        "_merge_order": 1
    })
    parser.processRow(last_row)
示例#11
0
        del bug["everconfirmed"]
    if bug.id == "692436_1336314345":
        bug.votes = 3

    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass

    bug = transform_bugzilla.rename_attachments(bug)
    for c in bug.changes:
        c.field_name = c.field_name.replace("attachments.", "attachments_")
        if c.attach_id == '':
            c.attach_id = None
        else:
            c.attach_id = CNV.value2int(c.attach_id)

    bug.attachments = Q.sort(bug.attachments, "attach_id")
    for a in bug.attachments:
        a.attach_id = CNV.value2int(a.attach_id)
        for k, v in list(a.items()):
            if k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate"):
                struct.unwrap(a)[k] = CNV.value2int(v) # PREVENT dot (.) INTERPRETATION
                a[k.split(".")[-1].split("_")[-1]] = CNV.value2int(v)

    bug = transform_bugzilla.normalize(bug)
    return bug
示例#12
0
def normalize(bug, old_school=False):
    """
    Return a normalized copy of one bug version; `bug` itself is not modified.

    The copy gets its `id` built from bug_id and modified_ts, has its flags,
    attachments and changes sorted so that successive runs diff cleanly, is
    passed through ElasticSearch.scrub(), and finally has its numeric and
    date fields converted.

    :param bug: the bug version record to normalize
    :param old_school: when true, the short attachment flag keys (for example
        "isobsolete") are NOT added next to the prefixed ones
    :return: the normalized copy
    """
    bug = bug.copy()
    # id is "<bug_id>_<modified_ts without its last three digits>"
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    # ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(
                CNV.object2JSON(bug.attachments).replace("attachments_", "attachments.")
            )
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            # iterate over a snapshot because keys are added inside the loop
            for k, v in list(a.items()):
                if k.startswith("attachments") and k.endswith(("isobsolete", "ispatch", "isprivate")):
                    new_v = CNV.value2int(v)
                    # drop the 12-character "attachments." / "attachments_" prefix
                    new_k = k[12:]
                    # presumably the escaped dot stops "." being read as a path separator
                    a[k.replace(".", "\\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    # bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            # zero values are not stored at all
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None:
            continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                # already looks like epoch milliseconds
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Parse "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # "000" pads the milliseconds to the six digits of %f.  The
                # previous format ("%H:%M%:S%f") held an invalid "%:"
                # directive and no "." before the fraction, so - assuming
                # strptime-style parsing - this branch could never succeed.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/") + "000", "%Y/%m/%d %H:%M:%S.%f")
                )
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Parse "2012/01/01 00:00:00" (or the same with "-") into a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S")
                )
            elif DATE_PATTERN_RELAXED.match(v):
                # Only the leading "2012-01-01" part is used
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d")
                )
        except Exception as e:
            Log.error(
                "problem with converting date to milli (value={{value}})",
                {"value": bug[dateField]}, e)

    # everything above worked on a copy, so it must be handed back
    return bug
def analysis(settings, last_run, please_stop):
    """
    Repeatedly pair "problem" emails with their most likely replacement.

    Each pass totals the negative counts per aliased email over every entry
    of `bugs`.  Then, most negative first, it sums the multisets of the bugs
    showing that problem and registers the top-scoring email as the alias,
    provided it beats the runner-up by more than DIFF.  Passes repeat until
    one adds no alias or `please_stop` becomes truthy; the aliases are then
    saved with saveAliases().

    :param settings: handed to saveAliases()
    :param last_run: truthy to use the smaller margin, look at every
        non-ignored negative email, and accept an exact two-way match
    :param please_stop: stop signal, checked between passes and problems
    """
    # ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    DIFF = 4 if last_run else 7
    threshold = -(DIFF / 2)

    found_alias = True
    while found_alias and not please_stop:
        # FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for _bug_id, agg in bugs.iteritems():
            # ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        worth_fixing = []
        for e, c in problem_agg.dic.iteritems():
            if aliases.get(e, Null).ignore:
                continue
            if c <= threshold or last_run:
                worth_fixing.append({"email": e, "count": c})
        problems = Q.sort(worth_fixing, ["count", "email"])

        found_alias = False
        for problem in problems:
            if please_stop:
                break

            # FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for _bug_id, agg in bugs.iteritems():
                # ONLY BUGS THAT ARE EXPERIENCING THIS problem
                if agg.dic.get(problem.email, 0) < 0:
                    solution_agg += agg
            scored = [
                {"email": e, "count": c}
                for e, c in solution_agg.dic.iteritems()
            ]
            solutions = Q.sort(scored, [{"field": "count", "sort": -1}, "email"])

            # an exact match is only trusted on the last run
            exact_match = (
                last_run
                and len(solutions) == 2
                and solutions[0].count == -solutions[1].count
            )
            if not exact_match:
                if len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                    # not distinctive enough
                    continue

            best_solution = solutions[0]
            top_counts = Q.select(solutions, "count")[:10]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(top_counts)
            })
            found_alias = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
def normalize(bug, old_school=False):
    """
    Return a normalized copy of one bug version; `bug` itself is not modified.

    The copy gets its `id` built from bug_id and modified_ts, has its flags,
    attachments and changes sorted so that successive runs diff cleanly, is
    passed through ElasticSearch.scrub(), and finally has its numeric and
    date fields converted.

    :param bug: the bug version record to normalize
    :param old_school: when true, the short attachment flag keys (for example
        "isobsolete") are NOT added next to the prefixed ones
    :return: the normalized copy
    """
    bug = bug.copy()
    # id is "<bug_id>_<modified_ts without its last three digits>"
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    # ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(
                CNV.object2JSON(bug.attachments).replace("attachments_", "attachments.")
            )
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            # iterate over a snapshot because keys are added inside the loop
            for k, v in list(a.items()):
                if k.startswith("attachments") and k.endswith(("isobsolete", "ispatch", "isprivate")):
                    new_v = CNV.value2int(v)
                    # drop the 12-character "attachments." / "attachments_" prefix
                    new_k = k[12:]
                    # presumably the escaped dot stops "." being read as a path separator
                    a[k.replace(".", "\\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    # bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            # zero values are not stored at all
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None:
            continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                # already looks like epoch milliseconds
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Parse "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # "000" pads the milliseconds to the six digits of %f.  The
                # previous format ("%H:%M%:S%f") held an invalid "%:"
                # directive and no "." before the fraction, so - assuming
                # strptime-style parsing - this branch could never succeed.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/") + "000", "%Y/%m/%d %H:%M:%S.%f")
                )
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Parse "2012/01/01 00:00:00" (or the same with "-") into a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S")
                )
            elif DATE_PATTERN_RELAXED.match(v):
                # Only the leading "2012-01-01" part is used
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d")
                )
        except Exception as e:
            Log.error("problem with converting date to milli (value={{value}})", {"value": bug[dateField]}, e)

    # everything above worked on a copy, so it must be handed back
    return bug