def test_serializes_numpy_ndarray(self):
    self.assertEqual(
        '{"d":[0,0,0,0,0]}',
        JSONSerializer().dumps({"d": np.zeros((5,), dtype=np.uint8)}),
    )
    # This isn't useful for Elasticsearch, just want to make sure it works.
    self.assertEqual(
        '{"d":[[0,0],[0,0]]}',
        JSONSerializer().dumps({"d": np.zeros((2, 2), dtype=np.uint8)}),
    )
def test_serializes_pandas_category(self):
    cat = pd.Categorical(["a", "c", "b", "a"], categories=["a", "b", "c"])
    self.assertEqual(
        '{"d":["a","c","b","a"]}',
        JSONSerializer().dumps({"d": cat}),
    )

    cat = pd.Categorical([1, 2, 3], categories=[1, 2, 3])
    self.assertEqual(
        '{"d":[1,2,3]}',
        JSONSerializer().dumps({"d": cat}),
    )
def test_class_serializer(self):
    class A(object):
        def to_serializable(self):
            return {"a": "b"}

    a = A()
    self.assertEqual('{"a": "b"}', JSONSerializer().dumps(a))
def test_chunks_are_chopped_by_chunk_size(self):
    self.assertEqual(
        10,
        len(
            list(
                helpers._chunk_actions(
                    self.actions, 10, 99999999, JSONSerializer()
                )
            )
        ),
    )
def test_chunks_are_chopped_by_byte_size(self):
    self.assertEqual(
        100,
        len(
            list(helpers._chunk_actions(self.actions, 100000, 1, JSONSerializer()))
        ),
    )
def main(args):
    parser = argparse.ArgumentParser(
        usage='%(prog)s [options] command\n\nVersion\n %(prog)s version '
              + str(__version__))
    parser.add_argument('--version', action='version',
                        version='%(prog)s ' + str(__version__))
    parser.add_argument('lognames', metavar='lognames', nargs='+',
                        help="log files to parse")
    options = parser.parse_args(args)
    if len(options.lognames) < 1:
        sys.stderr.write("error: not enough arguments\n")
        parser.print_help()
        return 1

    serializer = JSONSerializer()

    for logname in options.lognames:
        with open(logname) as logfile:
            for event in generate_events(logfile, logname):
                # The elasticsearch serializer does have a dumps method, but
                # we don't use it because it turns off json.dumps' ensure_ascii.
                # We want to enforce ascii because it's not actually specified
                # what encoding the log file is in. We were also getting
                # invalid utf-8 sequences.
                sys.stdout.write(json.dumps(event, default=serializer.default))
                sys.stdout.write('\n')
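The comment above is the key detail. A minimal sketch of the difference, assuming elasticsearch-py's JSONSerializer (which passes ensure_ascii=False and compact separators to json.dumps):

import json
from elasticsearch.serializer import JSONSerializer

json.dumps({"msg": "héllo"})              # '{"msg": "h\u00e9llo"}' -- escaped, ASCII-safe
JSONSerializer().dumps({"msg": "héllo"})  # '{"msg":"héllo"}' -- raw UTF-8 output

Using json.dumps with default=serializer.default keeps the serializer's type handling (dates, decimals, etc.) while retaining the ASCII-safe escaping.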
def handle(self, *args, **options):
    json = JSONSerializer()

    if "datasource" not in options:
        self.stderr.write("You need to specify datasource to export")
        return

    config = django_apps.app_configs[options["datasource"]]
    ElasticModel = config.elastic_model

    all_docs = ElasticModel.search()
    if options["to"] is not None:
        all_docs = all_docs.query(
            "match_all")[options["from"]:options["to"]]
        total_count = all_docs.count()
        all_docs = all_docs.execute()
    elif options["from"]:
        all_docs = all_docs.query("match_all")[options["from"]:]
        total_count = all_docs.count()
        all_docs = all_docs.execute()
    else:
        total_count = all_docs.count()
        all_docs = all_docs.scan()

    for doc in tqdm.tqdm(all_docs, total=total_count):
        doc_json = doc.to_dict()

        if not options["keep_service_fields"]:
            for f in self.service_fields:
                if f in doc_json:
                    del doc_json[f]

        options["outfile"].write(json.dumps(doc_json) + "\n")
def test_serializes_pandas_na(self):
    if not hasattr(pd, "NA"):  # pandas.NA added in v1
        raise SkipTest("pandas.NA required")
    self.assertEqual(
        '{"d":null}',
        JSONSerializer().dumps({"d": pd.NA}),
    )
def main():
    root = sys.argv[1]
    key = sys.argv[2]

    outname = 'tmp/%s' % sanitize_filename(key)
    if os.path.exists(outname):
        sys.stderr.write("'%s' already done\n" % str(outname))
        sys.exit(0)

    from elasticsearch.serializer import JSONSerializer
    serializer = JSONSerializer()

    try:
        with gzip.open(outname, 'wb') as out:
            with open(os.path.join(root, key), mode='rb') as logfile:
                gzfile = gzip.GzipFile(fileobj=logfile, mode='rb')
                for event in generate_events(enumerate(gzfile), key):
                    # The elasticsearch serializer does have a dumps method,
                    # but we don't use it because it turns off json.dumps'
                    # ensure_ascii. We want to enforce ascii because it's not
                    # actually specified what encoding the log file is in.
                    # We were also getting invalid utf-8 sequences.
                    out.write(json.dumps(event, default=serializer.default))
                    out.write('\n')
    except Exception as err:
        if os.path.exists(outname):
            os.remove(outname)
        raise err
def test_uuid_serialization(self):
    self.assertEqual(
        '{"d":"00000000-0000-0000-0000-000000000003"}',
        JSONSerializer().dumps(
            {"d": uuid.UUID("00000000-0000-0000-0000-000000000003")}
        ),
    )
def test_serializes_numpy_floats(self):
    ser = JSONSerializer()
    for np_type in (
        np.float_,
        np.float32,
        np.float64,
    ):
        self.assertRegex(ser.dumps({"d": np_type(1.2)}), r'^\{"d":1\.2[\d]*}$')
def test_chunks_are_chopped_by_byte_size_properly(self):
    max_byte_size = 170
    chunks = list(
        helpers._chunk_actions(
            self.actions, 100000, max_byte_size, JSONSerializer()
        )
    )
    self.assertEqual(25, len(chunks))
    for chunk_data, chunk_actions in chunks:
        chunk = u"".join(chunk_actions)
        chunk = chunk if isinstance(chunk, str) else chunk.encode("utf-8")
        self.assertLessEqual(len(chunk), max_byte_size)
def search(self, body):
    """
    Execute a search query. The passed query must be a valid Elasticsearch
    query. The query is sent to the connection together with the
    corresponding index, and the result is returned.
    """
    self.logger.debug('Execute search: %s', JSONSerializer().dumps(body))
    return self.__connection.search(
        body=body,
        index='syslog')  # config.system.es.index
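A minimal, hypothetical call to this method (the receiver name and query body are assumptions, not from the original code):

# Match documents whose message field contains "error".
results = backend.search({"query": {"match": {"message": "error"}}})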
def _query_backend(self):
    consumer = KafkaConsumer(
        bootstrap_servers=KAFKA_HOST,
        value_deserializer=lambda v: JSONSerializer().loads(v.decode('utf-8')))

    tp = TopicPartition(self.topic, 0)
    consumer.assign([tp])
    count = consumer.position(tp)

    consumer.seek(tp, 0)

    metrics = []
    for i in range(count):
        metrics.append(next(consumer))

    return metrics
def test_serializes_numpy_integers(self):
    ser = JSONSerializer()
    for np_type in (
        np.int_,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
    ):
        self.assertEqual(ser.dumps({"d": np_type(-1)}), '{"d":-1}')

    for np_type in (
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    ):
        self.assertEqual(ser.dumps({"d": np_type(1)}), '{"d":1}')
def handle(self, *args, **options):
    json = JSONSerializer()

    all_decls = Search(index=options["indexes"]).doc_type(
        NACPDeclaration, Declaration)

    if options["to"] is not None:
        all_decls = all_decls.query(
            'match_all')[options["from"]:options["to"]].execute()
    elif options["from"]:
        all_decls = all_decls.query('match_all')[options["from"]:].execute()
    else:
        all_decls = all_decls.scan()

    for i, decl in enumerate(all_decls):
        decl_json = decl.api_response(options["sections"])

        options["outfile"].write(json.dumps(decl_json) + "\n")
        if i and i % 1000 == 0:
            self.stderr.write("Exported %s declarations" % i)
class SospiderPipeline(object):
    items_buffer = []
    serializer = JSONSerializer()

    def process_item(self, item, spider):
        data = dict(item)
        url_key = spider.url_key(item['url'])
        # if spider.redis_cache.exists(url_key):
        #     return item
        # else:
        #     spider.redis_cache.set(url_key, item['url'])
        #     spider.redis_cache.expire(url_key,
        #                               spider.conf_dict['expire_seconds'])
        extra_data = data.pop('extra')
        data.update(extra_data)
        self.index_item(data, spider)
        return item

    def index_item(self, item, spider):
        index_action = {
            '_index': spider.es_index,
            '_type': 'fulltext',
            '_source': item,
            '_id': uuid.uuid1(),
        }
        logging.info('get %s' % item['url'])
        try:
            # Make sure the action is serializable before buffering it.
            self.serializer.dumps(index_action)
            self.items_buffer.append(index_action)
        except Exception as e:
            logging.info('dumps failed: %s' % e)
        if len(self.items_buffer) > BUF_SIZ:
            self.send_item(spider)
            self.items_buffer = []

    def send_item(self, spider):
        res = helpers.bulk(spider.es, self.items_buffer)
        logging.info('bulk %s' % str(res))

    def close_spider(self, spider):
        if len(self.items_buffer):
            self.send_item(spider)
def to_json(data):
    """Convert Python structure to JSON used by Elasticsearch

    This is a helper method that uses the elasticsearch-py JSONSerializer
    to serialize the structure. This is the serializer that
    elasticsearch-py uses to serialize data for Elasticsearch and handles
    dates.

    :arg data: Python structure (e.g. dict, list, ...)

    :returns: string

    Examples:

    >>> to_json({'query': {'match': {'message': 'test message'}}})
    '{"query": {"match": {"message": "test message"}}}'

    >>> from elasticutils import S
    >>> some_s = S().query(message__match='test message')
    >>> to_json(some_s.build_search())
    '{"query": {"match": {"message": "test message"}}}'

    """
    return JSONSerializer().dumps(data)
def test_strings_are_left_untouched(self):
    self.assertEqual("你好", JSONSerializer().dumps("你好"))
def assertDictEqual(self, a, b):
    default = JSONSerializer().default
    self.assertEqual(
        json.dumps(a, sort_keys=True, default=default),
        json.dumps(b, sort_keys=True, default=default),
    )
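A hypothetical use of this helper (the values are made up): dicts containing objects the stdlib json module can't encode on its own, such as datetime, are compared via their sorted-key JSON form, with JSONSerializer's default hook handling the conversion.

self.assertDictEqual(
    {"a": 1, "ts": datetime(2010, 10, 1, 2, 30)},
    {"ts": datetime(2010, 10, 1, 2, 30), "a": 1},
)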
def main(args):
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool which takes a weighted listing of keyword searches
        and presents aggregations of this data to the user.''')
    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id',
                        default=-1, help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id',
                        default=-1, help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary',
                        default=0, help='The secondary job ID of the job (default: 0).')
    parser.add_argument('-t', '--target', metavar='hostname:port', dest='target',
                        default=None,
                        help='An Elasticsearch server to be queried. This defaults to '
                             'the contents of environment variable "CAST_ELASTIC".')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Displays the top --size logs matching the --errormap mappings.')
    parser.add_argument('--size', metavar='size', dest='size', default=10,
                        help='The number of results to be returned. (default=10)')
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts', nargs='*',
                        default=None,
                        help='A list of hostnames to filter the results to.')
    parser.add_argument('--errormap', metavar="file", dest="err_map_file", default=None,
                        help='A map of errors to scan the user jobs for, including weights.')

    args = parser.parse_args()

    # If the target wasn't specified, check the environment for the target
    # value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Load the weighted error mapping.
    error_map = None
    if args.err_map_file:
        error_map = JSONSerializer().loads(open(args.err_map_file).read())

    if error_map is None:
        parser.print_help()
        print("Error map '%s' could not be loaded." % args.err_map_file)
        return 2

    # Open a connection to the elastic cluster; if this fails, something is
    # wrong on the server.
    es = Elasticsearch(
        args.target,
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id,
                                 args.job_id_secondary)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")

    # Finding no matches with valid search criteria is a legitimate case,
    # so return 0, not 3.
    if total_hits is None:
        print("# Sorry. Could not find any matching results.")
        return 0

    if total_hits != 1:
        print("This implementation only supports queries where the hit count is equal to 1.")
        return 3

    # TODO: make this code more fault tolerant.
    hits = cast.deep_get(tr_res, "hits", "hits")
    tr_data = cast.deep_get(hits[0], "_source", "data")

    # ---------------------------------------------------------------------
    # Build the hostnames string:
    if args.hosts is None:
        args.hosts = tr_data.get("compute_nodes")

    hostnames = {
        "multi_match": {
            "query": " ".join(args.hosts),
            "type": "best_fields",
            "fields": ["hostname", "source"],
            "tie_breaker": 0.3,
            "minimum_should_match": 1
        }
    }

    # ---------------------------------------------------------------------
    (ranges, should_match) = cast.build_timestamp_range(
        tr_data.get("begin_time"),
        cast.deep_get(tr_data, "history", "end_time"))
    ranges.append(hostnames)

    # ---------------------------------------------------------------------
    # Build a body for the mapping query.
    body = {
        "_source": ["@timestamp"],
        "size": args.size,
    }

    # Check the keywords supplied by the json.
    results = {}
    for error in error_map:
        (category, result) = build_mapping_query(es, body.copy(), ranges, error)
        results[category] = result

    print(" ")

    # Print the results, sorted by max score.
    for category, response in sorted(
            results.items(),
            key=lambda kv: cast.deep_get(kv[1], "hits", "max_score"),
            reverse=True):
        # Get aggregations.
        aggregations = response.get("aggregations", [])
        total = cast.deep_get(response, "hits", "total")

        print("\"{0}\" Max Score : {1}".format(
            category, cast.deep_get(response, "hits", "max_score")))
        print("\"{0}\" Count : {1}".format(category, total))

        if aggregations is not None:
            # Sort aggregations by document count.
            for (aggregation, value) in sorted(
                    aggregations.items(),
                    key=lambda kv: kv[1].get("doc_count"),
                    reverse=True):
                print("  \"{0}\" : {1}".format(aggregation, value.get("doc_count")))

        if args.verbose:
            hits = cast.deep_get(response, "hits", "hits")
            print("\nTop {0} \"{1}\" Results:".format(len(hits), category))
            print("-" * 42)
            for hit in hits:
                print(json.dumps(hit["_source"]))
        print("=" * 42)
        print(" ")
def test_serializes_pandas_timestamp(self):
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": pd.Timestamp("2010-10-01T02:30:00")}),
    )
def test_serializes_numpy_datetime(self):
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": np.datetime64("2010-10-01T02:30:00")}),
    )
def test_serializes_pandas_series(self):
    self.assertEqual(
        '{"d":["a","b","c","d"]}',
        JSONSerializer().dumps({"d": pd.Series(["a", "b", "c", "d"])}),
    )
def test_serializes_numpy_bool(self):
    self.assertEqual('{"d":true}', JSONSerializer().dumps({"d": np.bool_(True)}))
def test_decimal_serialization(self):
    if sys.version_info[:2] == (2, 6):
        raise SkipTest("Float rounding is broken in 2.6.")
    self.assertEqual('{"d":3.8}', JSONSerializer().dumps({"d": Decimal("3.8")}))
def test_datetime_serialization(self):
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": datetime(2010, 10, 1, 2, 30)}),
    )
def test_raises_serialization_error_pandas_nat(self):
    if not hasattr(pd, "NaT"):
        raise SkipTest("pandas.NaT required")
    self.assertRaises(SerializationError, JSONSerializer().dumps, {"d": pd.NaT})
def test_raises_serialization_error_on_load_error(self):
    self.assertRaises(SerializationError, JSONSerializer().loads, object())
    self.assertRaises(SerializationError, JSONSerializer().loads, "")
    self.assertRaises(SerializationError, JSONSerializer().loads, "{{")
def test_raises_serialization_error_on_dump_error(self):
    self.assertRaises(SerializationError, JSONSerializer().dumps, object())