def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER
    """
    current_run_time = get_current_time(db)

    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                if not settings.es.alias:
                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = temp.last()
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  # REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                if not settings.es_comments.alias:
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
    else:
        # START ETL FROM BEGINNING, MAKE NEW INDEX
        last_run_time = 0
        if not es:
            # BUG VERSIONS
            schema = File(settings.es.schema_file).read()
            if transform_bugzilla.USE_ATTACHMENTS_DOT:
                schema = schema.replace("attachments_", "attachments\\.")
            schema = CNV.JSON2object(schema, paths=True)
            schema.settings = jsons.expand_dot(schema.settings)
            if not settings.es.alias:
                settings.es.alias = settings.es.index
                settings.es.index = ElasticSearch.proto_name(settings.es.alias)
            es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True)

            # BUG COMMENTS
            comment_schema = File(settings.es_comments.schema_file).read()
            comment_schema = CNV.JSON2object(comment_schema, paths=True)
            comment_schema.settings = jsons.expand_dot(comment_schema.settings)
            if not settings.es_comments.alias:
                settings.es_comments.alias = settings.es_comments.index
                settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias)
            es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)

        File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time)))

    return current_run_time, es, es_comments, last_run_time
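# A hedged illustration of the resume logic above: the two marker files encode
# three ETL states. etl_state() is NOT part of this codebase; it only restates
# the branch structure of setup_es() for clarity.
def etl_state(first_run_file, last_run_file):
    if first_run_file.exists and last_run_file.exists:
        return "incremental"           # BOTH MARKERS: PICK UP FROM last_run_time
    elif first_run_file.exists:
        return "resume_initial_fill"   # FIRST MARKER ONLY: INITIAL FILL WAS INTERRUPTED
    else:
        return "start_over"            # NO MARKERS: MAKE NEW INDEXES FROM SCHEMA FILES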
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
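# Hypothetical usage of open_test_instance(): a settings block with a
# "filename" yields a file-backed Fake_ES, one without yields a freshly
# created index on the cluster. The path below is illustrative only.
candidate_settings = Struct()
candidate_settings.filename = "tests/resources/candidate.json"  # ILLUSTRATIVE PATH
candidate = open_test_instance("candidate", candidate_settings)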
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [
                b
                for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                if b not in private_bugs
            ]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            # SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                # COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
            except Exception, e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
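# Note the sampling above is rejection sampling: private draws are discarded,
# so each round tests UP TO NUM_TO_TEST bugs, not exactly NUM_TO_TEST. A
# hedged alternative that makes the count exact (names are illustrative, and
# Random.int(n) is assumed to return a uniform integer in [0, n)):
def sample_public_bug_ids(private_bugs, num_to_test=100, max_bug_id=900000):
    some_bugs = set()
    while len(some_bugs) < num_to_test:
        b = Random.int(max_bug_id)
        if b not in private_bugs:
            some_bugs.add(b)   # SET MEMBERSHIP ALSO REMOVES DUPLICATE DRAWS
    return list(some_bugs)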
def test_replication():
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"], source)

        replicate.replicate(source, destination, [537285], CNV.string2datetime("19900101", "%Y%m%d"))
    finally:
        Log.stop()
def main(settings):
    # USE A FILE
    if settings.source.filename != None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))
        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
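# A hedged, worked example of the "GET LAST UPDATED" merge above: the earlier
# of the file-recorded and ES-recorded timestamps wins, falling back to epoch
# zero when neither exists, so no pending records are skipped. Values are
# illustrative.
from_file = CNV.string2datetime("20130105", "%Y%m%d")
from_es = CNV.string2datetime("20130101", "%Y%m%d")
last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
# last_updated IS 2013-01-01: REPLICATION RESUMES FROM THE OLDER MARKER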
def get_or_create_index(destination_settings, source):
    # CHECK IF INDEX, OR ALIAS, EXISTS
    es = ElasticSearch(destination_settings)
    aliases = es.get_aliases()

    indexes = [a for a in aliases if a.alias == destination_settings.index]
    if not indexes:
        # CREATE INDEX
        schema = source.get_schema()
        assert schema.settings
        assert schema.mappings
        ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
    elif len(indexes) > 1:
        Log.error("do not know how to replicate to more than one index")
    elif indexes[0].alias != None:
        destination_settings.alias = destination_settings.index
        destination_settings.index = indexes[0].index

    return ElasticSearch(destination_settings)
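# For clarity, a hedged restatement of the alias case above: when the
# configured name is actually an alias, the settings are rewritten so that
# .alias keeps the public name and .index points at the concrete index behind
# it. The index name below is illustrative, not the actual proto_name format:
#
#   before: destination_settings.index = "bugs"            (an alias)
#   after:  destination_settings.alias = "bugs"
#           destination_settings.index = "bugs20130615_000000"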
class TestLookForLeaks(unittest.TestCase):
    def setUp(self):
        settings = startup.read_settings(filename="leak_check_settings.json")
        Log.start(settings.debug)
        self.private = ElasticSearch(settings.private)
        self.public = ElasticSearch(settings.public)
        self.public_comments = ElasticSearch(settings.public_comments)
        self.settings = settings

    def tearDown(self):
        Log.stop()

    def blocks_of_bugs(self):
        max_bug_id = self.private.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [{"match_all": {}}]}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "bug_id"}}}
        }).facets["0"].max

        return reversed(list(Q.intervals(0, max_bug_id, self.settings.param.increment)))

    def test_private_bugs_not_leaking(self):
        bad_news = False

        # FOR ALL BUG BLOCKS
        for min_id, max_id in self.blocks_of_bugs():
            results = get(
                self.private,
                {"and": [
                    {"match_all": {}},
                    {"and": [
                        {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                        {"exists": {"field": "bug_group"}},
                        {"range": {"expires_on": {"gte": NOW}}},          # CURRENT RECORDS
                        {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                    ]}
                ]},
                ["bug_id", "bug_group", "modified_ts"]
            )

            private_ids = {b.bug_id: b.bug_group for b in results}

            Log.note("Ensure {{num}} bugs did not leak", {
                "num": len(private_ids.keys())
            })

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"terms": {"bug_id": private_ids.keys()}},
                    {"range": {"expires_on": {"gte": NOW}}}  # SOME BUGS WILL LEAK FOR A LITTLE WHILE
                ]}
            )

            if leaked_bugs:
                bad_news = True
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {"bug_id": leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} leaks!! {{bugs}}", {
                    "num": len(leaked_bugs),
                    "bugs": Q.run({
                        "from": leaked_bugs,
                        "select": [
                            "bug_id",
                            "bug_version_num",
                            {"name": "modified_ts", "value": lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}
                        ],
                        "sort": "bug_id"
                    })
                })
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "bug_group": private_ids[b.bug_id],
                        "version": milli2datetime(b)
                    })

            # CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
            leaked_comments = get(
                self.public_comments,
                {"terms": {"bug_id": private_ids.keys()}},
                limit=20
            )
            if leaked_comments:
                bad_news = True

                if self.settings.param.delete:
                    self.public_comments.delete_record(
                        {"terms": {"bug_id": leaked_comments.bug_id}}
                    )

                Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
                    "num": len(leaked_comments),
                    "comments": leaked_comments
                })

        if bad_news:
            Log.error("Bugs have leaked!")

    def test_private_attachments_not_leaking(self):
        for min_id, max_id in self.blocks_of_bugs():
            # FIND ALL PRIVATE ATTACHMENTS
            bugs_w_private_attachments = get(
                self.private,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},          # CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                    {"nested": {  # HAS ATTACHMENT
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"exists": {"field": "attachments.attach_id"}}
                        }}
                    }},
                    {"or": [
                        {"nested": {  # PRIVATE ATTACHMENT, OR...
                            "path": "attachments",
                            "query": {"filtered": {
                                "query": {"match_all": {}},
                                "filter": {"term": {"attachments.isprivate": 1}}
                            }}
                        }},
                        {"exists": {"field": "bug_group"}}  # ...PRIVATE BUG
                    ]}
                ]},
                fields=["bug_id", "bug_group", "attachments", "modified_ts"]
            )

            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {"or": [
                    {"exists": "bug_group"},
                    {"terms": {"attachments.isprivate": ['1', True, 1]}}
                ]}
            })
            try:
                private_attachments = [int(v) for v in private_attachments]
            except Exception, e:
                pass  # KEEP THE RAW VALUES IF ANY attach_id IS NOT NUMERIC

            Log.note("Ensure {{num}} attachments did not leak", {
                "num": len(private_attachments)
            })

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},  # CURRENT BUGS
                    {"nested": {
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"terms": {"attach_id": private_attachments}}
                        }}
                    }}
                ]}
                # fields=["bug_id", "attachments"]
            )

            if leaked_bugs:
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {"bug_id": leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "version": b
                    })
                Log.error("Attachments have leaked!")
Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema=CNV.JSON2object(schema, paths=True) schema.settings=jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema=CNV.JSON2object(comment_schema, paths=True) comment_schema.settings=jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True) File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time))) return current_run_time, es, es_comments, last_run_time
def normalize(bug, old_school=False):
    bug = bug.copy()
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    # ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            for k, v in list(a.items()):
                if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
                    new_v = CNV.value2int(v)
                    new_k = k[12:]
                    a[k.replace(".", "\\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    # bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None:
            continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert "2012/01/01 00:00:00.000" to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012-01-01 00:00:00" to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Use only the "2012-01-01" date prefix.
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception, e:
            Log.error("problem with converting date to milli (value={{value}})", {"value": bug[dateField]}, e)

    bug.votes = None
    bug.exists = True

    return ElasticSearch.scrub(bug)
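# Hedged usage of normalize(): given a bug version wrapped in a Struct, it
# returns a scrubbed copy whose id is the bug_id plus the modification time
# truncated from milliseconds to seconds. Values below are illustrative.
bug = Struct()
bug.bug_id = 123456
bug.modified_ts = 1357027200000  # 2013-01-01 00:00:00 IN EPOCH MILLISECONDS
clean = normalize(bug)
# clean.id == "123456_1357027200"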
Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema = CNV.JSON2object(schema, paths=True) schema.settings = jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema = CNV.JSON2object(comment_schema, paths=True) comment_schema.settings = jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name( settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)