def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if is_data(value):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)
        return {k: _deep_json_to_string(v, depth - 1) for k, v in value.items()}
    elif is_sequence(value):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, number_types):
        return value
    elif is_text(value):
        return strings.limit(value, LOG_STRING_LENGTH)
    elif is_binary(value):
        return strings.limit(bytes2base64(value), LOG_STRING_LENGTH)
    elif isinstance(value, (date, datetime)):
        return datetime2unix(value)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
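# A minimal usage sketch (hypothetical values; assumes the mo-logs/mo-json helpers
# referenced above: is_data, is_sequence, value2json, strings, LOG_STRING_LENGTH, etc.).
# Dicts are walked down to the given depth; anything deeper, plus lists, long strings,
# and bytes, is JSON-encoded (or base64-encoded) and clipped to LOG_STRING_LENGTH.
#
# params = {"job": {"id": 42, "log": {"lines": ["a"] * 10000}}}
# flat = _deep_json_to_string(params, depth=2)
# # flat["job"]["id"] == 42            (numbers pass through unchanged)
# # flat["job"]["log"] is a string     (depth exhausted -> JSON-ified and limited)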
def _annotate(cls, item, timestamp, stack_depth):
    """
    :param item: A LogItem THE TYPE OF MESSAGE
    :param timestamp: THE TIME OF THE MESSAGE
    :param stack_depth: FOR TRACKING WHAT LINE THIS CAME FROM
    :return:
    """
    item.timestamp = timestamp
    item.machine = machine_metadata
    item.template = strings.limit(item.template, 10000)
    item.format = strings.limit(item.format, 10000)
    if item.format == None:
        format = text(item)
    else:
        format = item.format.replace("{{", "{{params.")
    if not format.startswith(CR) and format.find(CR) > -1:
        format = CR + format

    if cls.trace:
        log_format = item.format = "{{machine.name}} (pid {{machine.pid}}) - {{timestamp|datetime}} - {{thread.name}} - \"{{location.file}}:{{location.line}}\" - ({{location.method}}) - " + format
        f = sys._getframe(stack_depth + 1)
        item.location = {
            "line": f.f_lineno,
            "file": text(f.f_code.co_filename),
            "method": text(f.f_code.co_name)
        }
        thread = _Thread.current()
        item.thread = {"name": thread.name, "id": thread.id}
    else:
        log_format = item.format = "{{timestamp|datetime}} - " + format

    cls.main_log.write(log_format, item.__data__())
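# Sketch of the template rewrite performed above (values hypothetical, derived only
# from this function). With cls.trace enabled, a call such as
# Log.note("hello {{name}}", name="world") ends up with a format string roughly like:
#
#   {{machine.name}} (pid {{machine.pid}}) - {{timestamp|datetime}} - {{thread.name}}
#   - "{{location.file}}:{{location.line}}" - ({{location.method}}) - hello {{params.name}}
#
# Note the "{{" -> "{{params." rewrite: caller parameters are namespaced under
# "params" in the serialized LogItem before it is handed to main_log.write().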
def post(self, path, **kwargs):
    url = self.settings.host + ":" + unicode(self.settings.port) + path

    try:
        wrap(kwargs).headers["Accept-Encoding"] = "gzip,deflate"

        data = kwargs.get(b'data')
        if data == None:
            pass
        elif isinstance(data, Mapping):
            kwargs[b'data'] = data = convert.unicode2utf8(convert.value2json(data))
        elif not isinstance(kwargs["data"], str):
            Log.error("data must be utf8 encoded string")

        if self.debug:
            sample = kwargs.get(b'data', "")[:300]
            Log.note("{{url}}:\n{{data|indent}}", url=url, data=sample)

        if self.debug:
            Log.note("POST {{url}}", url=url)
        response = http.post(url, **kwargs)
        if response.status_code not in [200, 201]:
            Log.error(
                response.reason.decode("latin1") + ": "
                + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000)
            )
        if self.debug:
            Log.note("response: {{response}}", response=utf82unicode(response.content)[:130])
        details = mo_json.json2value(utf82unicode(response.content))
        if details.error:
            Log.error(convert.quote2string(details.error))
        if details._shards.failed > 0:
            Log.error(
                "Shard failures {{failures|indent}}",
                failures="---\n".join(r.replace(";", ";\n") for r in details._shards.failures.reason)
            )
        return details
    except Exception as e:
        if url[0:4] != "http":
            suggestion = " (did you forget \"http://\" prefix on the host name?)"
        else:
            suggestion = ""

        if kwargs.get("data"):
            Log.error(
                "Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
                url=url,
                body=strings.limit(kwargs["data"], 100 if self.debug else 10000),
                cause=e
            )
        else:
            Log.error("Problem with call to {{url}}" + suggestion, url=url, cause=e)
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    json_string = text(json_string)
    if not is_text(json_string) and json_string.__class__.__name__ != "FileString":
        Log.error("only unicode json accepted")

    try:
        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        if flexible:
            value = hjson2value(json_string)
        else:
            value = to_data(json_decoder(text(json_string)))

        if leaves:
            value = leaves_to_data(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = " " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error(CAN_NOT_DECODE_JSON + " at:\n\t{{sample}}\n\t{{pointer}}\n", sample=sample, pointer=pointer)

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + " ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception:
            char_str = " "
        Log.error(CAN_NOT_DECODE_JSON + ":\n{{char_str}}\n{{hexx_str}}\n", char_str=char_str, hexx_str=hexx_str, cause=e)
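# A small usage sketch (hypothetical strings; assumes this json2value and the
# mo-json-style helpers it calls). params are expanded into the JSON text before
# decoding, and leaves=True treats dot-delimited keys as paths:
#
# doc = json2value('{"name": {{who|quote}}}', params={"who": "ekyle"})
# # -> {"name": "ekyle"}
# doc = json2value('{"a.b": 3}', leaves=True)
# # -> {"a": {"b": 3}}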
def write(self, template, params):
    if params.get("template"):
        # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
        self.queue.add({"value": params})
    else:
        template = strings.limit(template, 2000)
        self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
    return self
def write(self, template, params):
    try:
        params.template = strings.limit(params.template, 2000)
        params.format = None
        self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
    except Exception as e:
        sys.stdout.write(text_type(Except.wrap(e)))
    return self
def write(self, template, params):
    try:
        params.template = strings.limit(params.template, 2000)
        params.format = None
        self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
    except Exception as e:
        sys.stdout.write(text(Except.wrap(e)))
    return self
def minimize_repo(repo):
    """
    RETURN A MINIMAL VERSION OF THIS CHANGESET
    """
    if repo == None:
        return Null
    output = wrap(_copy_but(repo, _exclude_from_repo))
    output.changeset.description = strings.limit(output.changeset.description, 1000)
    return output
def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if isinstance(value, Mapping):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)
        return {k: _deep_json_to_string(v, depth - 1) for k, v in value.items()}
    elif isinstance(value, list):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, (float, int, long)):
        return value
    elif isinstance(value, basestring):
        return strings.limit(value, LOG_STRING_LENGTH)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
def _normalize_revision(self, r, found_revision, push, get_diff):
    new_names = set(r.keys()) - {
        "rev", "node", "user", "description", "desc", "date", "files", "backedoutby",
        "parents", "children", "branch", "tags", "pushuser", "pushdate", "pushid",
        "phase", "bookmarks"
    }
    if new_names and not r.tags:
        Log.warning("hg is returning new property names ({{names}})", names=new_names)

    changeset = Changeset(
        id=r.node,
        id12=r.node[0:12],
        author=r.user,
        description=strings.limit(coalesce(r.description, r.desc), 2000),
        date=parse_hg_date(r.date),
        files=r.files,
        backedoutby=r.backedoutby if r.backedoutby else None,
        bug=self._extract_bug_id(r.description)
    )
    rev = Revision(
        branch=found_revision.branch,
        index=r.rev,
        changeset=changeset,
        parents=unwraplist(list(set(r.parents))),
        children=unwraplist(list(set(r.children))),
        push=push,
        phase=r.phase,
        bookmarks=unwraplist(r.bookmarks),
        etl={"timestamp": Date.now().unix, "machine": machine_metadata}
    )

    r.pushuser = None
    r.pushdate = None
    r.pushid = None
    r.node = None
    r.user = None
    r.desc = None
    r.description = None
    r.date = None
    r.files = None
    r.backedoutby = None
    r.parents = None
    r.children = None
    r.bookmarks = None

    set_default(rev, r)

    # ADD THE DIFF
    if get_diff or GET_DIFF:
        rev.changeset.diff = self._get_json_diff_from_hg(rev)

    try:
        _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
        with self.es_locker:
            self.es.add({"id": _id, "value": rev})
    except Exception as e:
        Log.warning("did not save to ES", cause=e)

    return rev
def delete(self, path, **kwargs):
    url = self.settings.host + ":" + unicode(self.settings.port) + path
    try:
        response = http.delete(url, **kwargs)
        if response.status_code not in [200]:
            Log.error(response.reason + ": " + response.all_content)
        if self.debug:
            Log.note("response: {{response}}", response=strings.limit(utf82unicode(response.all_content), 130))
        details = wrap(mo_json.json2value(utf82unicode(response.all_content)))
        if details.error:
            Log.error(details.error)
        return details
    except Exception as e:
        Log.error("Problem with call to {{url}}", url=url, cause=e)
def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if isinstance(value, Mapping):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)
        return {k: _deep_json_to_string(v, depth - 1) for k, v in value.items()}
    elif isinstance(value, (list, FlatList)):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, number_types):
        return value
    elif isinstance(value, text_type):
        return strings.limit(value, LOG_STRING_LENGTH)
    elif isinstance(value, binary_type):
        return strings.limit(bytes2base64(value), LOG_STRING_LENGTH)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
def minimize_repo(repo):
    # output = set_default({}, _exclude_from_repo, repo)
    output = wrap(_copy_but(repo, _exclude_from_repo))
    output.changeset.description = strings.limit(output.changeset.description, 1000)
    return output
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py#L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = " " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error("Can not decode JSON at:\n\t" + sample + "\n\t" + pointer + "\n")

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + " ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception as e:
            char_str = " "
        Log.error("Can not decode JSON:\n" + char_str + "\n" + hexx_str + "\n", e)
def extract(self, settings, force, restart, start, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if start:
            state = start, 0
        elif restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id == 283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            get_ids = sql_query({
                "from": "job",
                "select": ["id"],
                "where": {
                    "or": [
                        {"gt": {"last_modified": Date(last_modified)}},
                        {
                            "and": [
                                {"eq": {"last_modified": Date(last_modified)}},
                                {"gt": {"id": job_id}},
                            ]
                        },
                    ]
                },
                "sort": ["last_modified", "id"],
                "limit": settings.extractor.chunk_size,
            })
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # SOME LIMITS PLACED ON STRING SIZE
            for fl in jx.drill(acc, "job_log.failure_line"):
                fl.message = strings.limit(fl.message, 10000)
            for r in acc:
                r.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")
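# Sketch of the resume-state round trip used above (hypothetical Redis key; assumes
# the same value2json/json2value pair). The cursor is a (last_modified, job_id) tuple
# stored as JSON, so a restart resumes after the last document written:
#
# redis.set("extractor.job", value2json((last_modified, job_id)).encode("utf8"))
# last_modified, job_id = json2value(redis.get("extractor.job").decode("utf8"))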
def diff_to_json(unified_diff):
    """
    CONVERT UNIFIED DIFF TO EASY-TO-STORE JSON FORMAT

    :param unified_diff: text
    :return: JSON details
    """
    output = []
    files = FILE_SEP.split(unified_diff)[1:]

    for file_ in files:
        changes = []
        old_file_header, new_file_header, file_diff = file_.split("\n", 2)
        old_file_path = old_file_header[1:]  # eg old_file_header == "a/testing/marionette/harness/marionette_harness/tests/unit/unit-tests.ini"
        new_file_path = new_file_header[5:]  # eg new_file_header == "+++ b/tests/resources/example_file.py"

        c = 0, 0
        hunks = HUNK_SEP.split(file_diff)[1:]
        for hunk in hunks:
            line_diffs = hunk.split("\n")
            old_start, old_length, new_start, new_length = HUNK_HEADER.match(line_diffs[0]).groups()
            next_c = max(0, int(new_start) - 1), max(0, int(old_start) - 1)
            if next_c[0] - next_c[1] != c[0] - c[1]:
                Log.error("expecting a skew of {{skew}}", skew=next_c[0] - next_c[1])
            if c[0] > next_c[0]:
                Log.error("can not handle out-of-order diffs")
            while c[0] != next_c[0]:
                c = no_change(c)

            for line in line_diffs[1:]:
                if not line:
                    continue
                if (
                    line.startswith("new file mode")
                    or line.startswith("deleted file mode")
                    or line.startswith("index ")
                    or line.startswith("diff --git")
                ):
                    # HAPPENS AT THE TOP OF NEW FILES
                    # diff --git a/security/sandbox/linux/SandboxFilter.cpp b/security/sandbox/linux/SandboxFilter.cpp
                    # u'new file mode 100644'
                    # u'deleted file mode 100644'
                    # index a763e390731f5379ddf5fa77090550009a002d13..798826525491b3d762503a422b1481f140238d19
                    # GIT binary patch
                    # literal 30804
                    break
                d = line[0]
                if d == '+':
                    changes.append({"new": {"line": int(c[0]), "content": strings.limit(line[1:], MAX_CONTENT_LENGTH)}})
                elif d == '-':
                    changes.append({"old": {"line": int(c[1]), "content": strings.limit(line[1:], MAX_CONTENT_LENGTH)}})
                try:
                    c = MOVE[d](c)
                except Exception as e:
                    Log.warning("bad line {{line|quote}}", line=line, cause=e)

        output.append({
            "new": {"name": new_file_path},
            "old": {"name": old_file_path},
            "changes": changes
        })
    return wrap(output)
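# Illustrative output shape (hypothetical single-file, single-hunk diff; assumes the
# module-level FILE_SEP/HUNK_SEP/HUNK_HEADER/MOVE definitions this function relies on).
# Each changed file becomes one record with new/old names and a list of added/removed lines:
#
# [{
#     "new": {"name": "tests/resources/example_file.py"},
#     "old": {"name": "tests/resources/example_file.py"},
#     "changes": [{"new": {"line": 0, "content": "import os"}}]
# }]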
def _normalize_revision(self, r, found_revision, push, get_diff, get_moves):
    new_names = set(r.keys()) - KNOWN_TAGS
    if new_names and not r.tags:
        Log.warning(
            "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}",
            names=new_names,
            changeset=r.node,
            url=found_revision.branch.url,
        )

    changeset = Changeset(
        id=r.node,
        id12=r.node[0:12],
        author=coalesce(r.author, r.user),
        description=strings.limit(coalesce(r.description, r.desc), 2000),
        date=parse_hg_date(r.date),
        files=r.files,
        backedoutby=r.backedoutby,
        backsoutnodes=r.backsoutnodes,
        bug=mo_math.UNION(([int(b) for b in r.bugs.no], self._extract_bug_id(r.description))),
    )
    rev = Revision(
        branch=found_revision.branch,
        index=r.rev,
        changeset=changeset,
        parents=set(r.parents),
        children=set(r.children),
        push=push,
        phase=r.phase,
        bookmarks=unwraplist(r.bookmarks),
        landingsystem=r.landingsystem,
        etl={"timestamp": Date.now().unix, "machine": machine_metadata},
    )
    rev = elasticsearch.scrub(rev)

    r.pushuser = None
    r.pushdate = None
    r.pushid = None
    r.node = None
    r.user = None
    r.desc = None
    r.description = None
    r.date = None
    r.files = None
    r.backedoutby = None
    r.parents = None
    r.children = None
    r.bookmarks = None
    r.landingsystem = None
    r.extra = None
    r.author = None
    r.pushhead = None
    r.reviewers = None
    r.bugs = None
    r.treeherderrepourl = None
    r.backsoutnodes = None
    r.treeherderrepo = None
    r.perfherderurl = None
    r.branch = None
    r.phase = None
    r.rev = None
    r.tags = None

    set_default(rev, r)

    # ADD THE DIFF
    if get_diff:
        rev.changeset.diff = self._get_json_diff_from_hg(rev)

    try:
        _id = (
            coalesce(rev.changeset.id12, "")
            + "-" + rev.branch.name
            + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
        )
        with self.repo_locker:
            self.repo.add({"id": _id, "value": rev})

        if get_moves:
            rev.changeset.moves = self._get_moves_from_hg(rev)
            with self.moves_locker:
                self.moves.add({"id": _id, "value": rev})
    except Exception as e:
        e = Except.wrap(e)
        Log.warning(
            "Did not save to ES, waiting {{duration}} seconds",
            duration=WAIT_AFTER_NODE_FAILURE,
            cause=e,
        )
        Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
        if "FORBIDDEN/12/index read-only" in e:
            pass  # KNOWN FAILURE MODE

    return rev
def extend(self, records):
    """
    records - MUST HAVE FORM OF
        [{"value":value}, ... {"value":value}] OR
        [{"json":json}, ... {"json":json}]
        OPTIONAL "id" PROPERTY IS ALSO ACCEPTED
    """
    if self.settings.read_only:
        Log.error("Index opened in read only mode, no changes allowed")
    lines = []
    try:
        for r in records:
            id = r.get("id")
            r_value = r.get('value')
            if id == None and r_value:
                id = r_value.get('_id')
            if id == None:
                id = random_id()

            if "json" in r:
                json_bytes = r["json"].encode("utf8")
            elif r_value or isinstance(r_value, (dict, Data)):
                json_bytes = convert.value2json(r_value).encode("utf8")
            else:
                json_bytes = None
                Log.error("Expecting every record given to have \"value\" or \"json\" property")

            lines.append(b'{"index":{"_id": ' + convert.value2json(id).encode("utf8") + b'}}')
            if self.settings.tjson:
                lines.append(json2typed(json_bytes.decode('utf8')).encode('utf8'))
            else:
                lines.append(json_bytes)
        del records

        if not lines:
            return

        with Timer("Add {{num}} documents to {{index}}", {"num": len(lines) / 2, "index": self.settings.index}, debug=self.debug):
            try:
                data_bytes = b"\n".join(l for l in lines) + b"\n"
            except Exception as e:
                Log.error("can not make request body from\n{{lines|indent}}", lines=lines, cause=e)

            response = self.cluster.post(
                self.path + "/_bulk",
                data=data_bytes,
                headers={"Content-Type": "text"},
                timeout=self.settings.timeout,
                retry=self.settings.retry,
                params={"consistency": self.settings.consistency}
            )
            items = response["items"]

            fails = []
            if self.cluster.version.startswith("0.90."):
                for i, item in enumerate(items):
                    if not item.index.ok:
                        fails.append(i)
            elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
                for i, item in enumerate(items):
                    if item.index.status not in [200, 201]:
                        fails.append(i)
            else:
                Log.error("version not supported {{version}}", version=self.cluster.version)

            if fails:
                if len(fails) <= 3:
                    cause = [
                        Except(
                            template="{{status}} {{error}} (and {{some}} others) while loading line id={{id}} into index {{index|quote}}:\n{{line}}",
                            status=items[i].index.status,
                            error=items[i].index.error,
                            some=len(fails) - 1,
                            line=strings.limit(lines[i * 2 + 1], 500 if not self.debug else 100000),
                            index=self.settings.index,
                            id=items[i].index._id
                        )
                        for i in fails
                    ]
                else:
                    i = fails[0]
                    cause = Except(
                        template="{{status}} {{error}} (and {{some}} others) while loading line id={{id}} into index {{index|quote}}:\n{{line}}",
                        status=items[i].index.status,
                        error=items[i].index.error,
                        some=len(fails) - 1,
                        line=strings.limit(lines[i * 2 + 1], 500 if not self.debug else 100000),
                        index=self.settings.index,
                        id=items[i].index._id
                    )
                Log.error("Problems with insert", cause=cause)

    except Exception as e:
        if e.message.startswith("sequence item "):
            Log.error("problem with {{data}}", data=repr(lines[int(e.message[14:16].strip())]), cause=e)
        Log.error("problem sending to ES", e)
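# Minimal usage sketch (hypothetical index handle and documents; follows the record
# form documented in the docstring above). Each record carries either a pre-encoded
# "json" string or a "value" object, with an optional "id":
#
# index.extend([
#     {"id": "doc-1", "value": {"name": "first", "size": 10}},
#     {"json": '{"name": "second", "size": 20}'},
# ])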
def _normalize_revision(self, r, found_revision, push, get_diff, get_moves):
    new_names = set(r.keys()) - KNOWN_TAGS
    if new_names and not r.tags:
        Log.warning(
            "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}",
            names=new_names,
            changeset=r.node,
            url=found_revision.branch.url
        )

    changeset = Changeset(
        id=r.node,
        id12=r.node[0:12],
        author=r.user,
        description=strings.limit(coalesce(r.description, r.desc), 2000),
        date=parse_hg_date(r.date),
        files=r.files,
        backedoutby=r.backedoutby if r.backedoutby else None,
        bug=self._extract_bug_id(r.description)
    )
    rev = Revision(
        branch=found_revision.branch,
        index=r.rev,
        changeset=changeset,
        parents=unwraplist(list(set(r.parents))),
        children=unwraplist(list(set(r.children))),
        push=push,
        phase=r.phase,
        bookmarks=unwraplist(r.bookmarks),
        landingsystem=r.landingsystem,
        etl={"timestamp": Date.now().unix, "machine": machine_metadata}
    )

    r.pushuser = None
    r.pushdate = None
    r.pushid = None
    r.node = None
    r.user = None
    r.desc = None
    r.description = None
    r.date = None
    r.files = None
    r.backedoutby = None
    r.parents = None
    r.children = None
    r.bookmarks = None
    r.landingsystem = None

    set_default(rev, r)

    # ADD THE DIFF
    if get_diff:
        rev.changeset.diff = self._get_json_diff_from_hg(rev)
    if get_moves:
        rev.changeset.moves = self._get_moves_from_hg(rev)

    try:
        _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
        with self.es_locker:
            self.es.add({"id": _id, "value": rev})
    except Exception as e:
        e = Except.wrap(e)
        Log.warning("Did not save to ES, waiting {{duration}} seconds", duration=WAIT_AFTER_NODE_FAILURE, cause=e)
        Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
        if "FORBIDDEN/12/index read-only" in e:
            pass  # KNOWN FAILURE MODE

    return rev