def test_chunks_are_chopped_by_byte_size_properly(self):
    max_byte_size = 170
    chunks = list(
        helpers._chunk_actions(
            self.actions, 100000, max_byte_size, JSONSerializer()))
    self.assertEqual(25, len(chunks))
    for chunk_data, chunk_actions in chunks:
        chunk = u"".join(chunk_actions)
        chunk = chunk if isinstance(chunk, str) else chunk.encode("utf-8")
        self.assertLessEqual(len(chunk), max_byte_size)

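# --- Added sketch (not part of the original test suite) ---
# The test above drives the private helpers._chunk_actions directly. A minimal
# sketch of the public equivalent, assuming a reachable cluster and a
# hypothetical "logs" index: the same byte cap is exposed to callers through
# the max_chunk_bytes argument of helpers.bulk / helpers.streaming_bulk.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # assumption: cluster on localhost:9200
actions = ({"_index": "logs", "_source": {"n": i}} for i in range(1000))
# Flush a bulk request once the serialized chunk would exceed max_chunk_bytes
# or chunk_size actions, whichever comes first (170 bytes mirrors the test).
helpers.bulk(es, actions, chunk_size=100, max_chunk_bytes=170)
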
def search(self, body):
    """
    Execute a search query.

    The passed query must be a valid Elasticsearch query. It is passed to
    the connection together with the corresponding index, and the result
    is returned.
    """
    self.logger.debug('Execute search: %s', JSONSerializer().dumps(body))
    return self.__connection.search(
        body=body,
        index='syslog')  # config.system.es.index

def _query_backend(self):
    consumer = KafkaConsumer(
        bootstrap_servers=KAFKA_HOST,
        value_deserializer=lambda v: JSONSerializer().loads(v.decode('utf-8')))

    tp = TopicPartition(self.topic, 0)
    consumer.assign([tp])
    count = consumer.position(tp)
    consumer.seek(tp, 0)

    metrics = []
    for i in range(count):
        metrics.append(next(consumer))

    return metrics

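# --- Added sketch (not part of the original module) ---
# Producer-side counterpart to the consumer above: the same JSONSerializer can
# serialize values symmetrically before they are written to Kafka. KAFKA_HOST
# is reused from the snippet above; the topic name here is hypothetical.
from kafka import KafkaProducer
from elasticsearch.serializer import JSONSerializer

serializer = JSONSerializer()
producer = KafkaProducer(
    bootstrap_servers=KAFKA_HOST,
    value_serializer=lambda v: serializer.dumps(v).encode('utf-8'))
producer.send('metrics', {"host": "node-1", "cpu": 0.42})
producer.flush()
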
class SospiderPipeline(object):
    items_buffer = []
    serializer = JSONSerializer()

    def process_item(self, item, spider):
        data = dict(item)
        url_key = spider.url_key(item['url'])
        # if spider.redis_cache.exists(url_key):
        #     return item
        # else:
        #     spider.redis_cache.set(url_key, item['url'])
        #     spider.redis_cache.expire(url_key,
        #                               spider.conf_dict['expire_seconds'])
        extra_data = data.pop('extra')
        data.update(extra_data)
        self.index_item(data, spider)
        return item

    def index_item(self, item, spider):
        index_action = {
            '_index': spider.es_index,
            '_type': 'fulltext',
            '_source': item,
            '_id': uuid.uuid1(),
        }
        logging.info('get %s' % item['url'])
        try:
            self.serializer.dumps(index_action)
            self.items_buffer.append(index_action)
        except Exception:
            logging.info('dumps failed')

        if len(self.items_buffer) > BUF_SIZ:
            self.send_item(spider)
            self.items_buffer = []

    def send_item(self, spider):
        res = helpers.bulk(spider.es, self.items_buffer)
        logging.info('bulk %s' % str(res))

    def close_spider(self, spider):
        # Flush whatever is left in the buffer when the spider shuts down.
        if len(self.items_buffer):
            self.send_item(spider)

def to_json(data):
    """Convert a Python structure to the JSON used by Elasticsearch.

    This helper uses the elasticsearch-py JSONSerializer, the same serializer
    elasticsearch-py uses to send data to Elasticsearch, so values such as
    dates are handled correctly.

    :arg data: Python structure (e.g. dict, list, ...)

    :returns: string

    Examples:

    >>> to_json({'query': {'match': {'message': 'test message'}}})
    '{"query": {"match": {"message": "test message"}}}'

    >>> from elasticutils import S
    >>> some_s = S().query(message__match='test message')
    >>> to_json(some_s.build_search())
    '{"query": {"match": {"message": "test message"}}}'

    """
    return JSONSerializer().dumps(data)

def test_serializes_pandas_series(self):
    self.assertEqual(
        '{"d":["a","b","c","d"]}',
        JSONSerializer().dumps({"d": pd.Series(["a", "b", "c", "d"])}),
    )

def test_raises_serialization_error_pandas_nat(self):
    if not hasattr(pd, "NaT"):
        raise SkipTest("pandas.NaT required")
    self.assertRaises(SerializationError, JSONSerializer().dumps, {"d": pd.NaT})

def default(self, data):
    if isinstance(data, set):
        return list(data)
    if isinstance(data, bytes):
        return str(data, encoding='utf-8')
    return JSONSerializer.default(self, data)

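# --- Added sketch (an assumption, not from the original source) ---
# An override like default() above normally lives on a JSONSerializer subclass
# that is handed to the client, so sets and bytes survive indexing. The class
# name and index below are hypothetical; the serializer= keyword is the
# elasticsearch-py 7.x way of wiring in a custom serializer.
from elasticsearch import Elasticsearch
from elasticsearch.serializer import JSONSerializer

class SetAndBytesSerializer(JSONSerializer):
    def default(self, data):
        if isinstance(data, set):
            return list(data)
        if isinstance(data, bytes):
            return str(data, encoding='utf-8')
        return JSONSerializer.default(self, data)

es = Elasticsearch(serializer=SetAndBytesSerializer())
es.index(index='example', body={"tags": {"a", "b"}, "raw": b"payload"})
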
def test_serializes_pandas_timestamp(self):
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": pd.Timestamp("2010-10-01T02:30:00")}),
    )

def test_serializes_numpy_bool(self):
    self.assertEqual('{"d":true}', JSONSerializer().dumps({"d": np.bool_(True)}))

def main(args):
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool which takes a weighted listing of keyword searches and presents aggregations of this data to the user.''')

    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id',
        default=-1, help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id',
        default=-1, help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary',
        default=0, help='The secondary job ID of the job (default : 0).')
    parser.add_argument('-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".')
    parser.add_argument('-v', '--verbose', action='store_true',
        help='Displays the top --size logs matching the --errormap mappings.')
    parser.add_argument('--size', metavar='size', dest='size', default=10,
        help='The number of results to be returned. (default=10)')
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts', nargs='*',
        default=None, help='A list of hostnames to filter the results to.')
    parser.add_argument('--errormap', metavar="file", dest="err_map_file", default=None,
        help='A map of errors to scan the user jobs for, including weights.')

    args = parser.parse_args()

    # If the target wasn't specified, check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Load the weighted error mapping.
    error_map = None
    if args.err_map_file:
        error_map = JSONSerializer().loads(open(args.err_map_file).read())

    if error_map is None:
        parser.print_help()
        print("Error map '%s' could not be loaded." % args.err_map_file)
        return 2

    # Open a connection to the elastic cluster; if this fails, something is wrong on the server.
    es = Elasticsearch(
        args.target,
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id, args.job_id_secondary)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")

    # Finding no matches with valid search criteria is a legit case;
    # return 0, not 3.
    if total_hits is None:
        print("# Sorry. Could not find any matching results.")
        return 0

    if total_hits != 1:
        print("This implementation only supports queries where the hit count is equal to 1.")
        return 3

    # TODO make this code more fault tolerant
    hits = cast.deep_get(tr_res, "hits", "hits")
    tr_data = cast.deep_get(hits[0], "_source", "data")

    # ---------------------------------------------------------------------------------------------
    # Build the hostnames string:
    if args.hosts is None:
        args.hosts = tr_data.get("compute_nodes")

    hostnames = {
        "multi_match": {
            "query": " ".join(args.hosts),
            "type": "best_fields",
            "fields": ["hostname", "source"],
            "tie_breaker": 0.3,
            "minimum_should_match": 1
        }
    }
    # ---------------------------------------------------------------------------------------------

    (ranges, should_match) = cast.build_timestamp_range(
        tr_data.get("begin_time"), cast.deep_get(tr_data, "history", "end_time"))
    ranges.append(hostnames)

    # ---------------------------------------------------------------------------------------------
    # Build a body for the mapping query.
    body = {
        "_source": ["@timestamp"],
        "size": args.size,
    }

    # Check the keywords supplied by the json.
    results = {}
    for error in error_map:
        (category, result) = build_mapping_query(es, body.copy(), ranges, error)
        results[category] = result

    print(" ")

    # Print the results, sorted by max score.
    for category, response in sorted(
            results.iteritems(),
            key=lambda (k, v): cast.deep_get(v, "hits", "max_score"),
            reverse=True):
        # Get aggregations.
        aggregations = response.get("aggregations", [])
        total = cast.deep_get(response, "hits", "total")

        print("\"{0}\" Max Score : {1}".format(
            category, cast.deep_get(response, "hits", "max_score")))
        print("\"{0}\" Count : {1}".format(category, total))

        if aggregations is not None:
            # Sort aggregations by document count.
            for (aggregation, value) in sorted(
                    aggregations.iteritems(),
                    key=lambda (k, v): v.get("doc_count"),
                    reverse=True):
                print(" \"{0}\" : {1}".format(aggregation, value.get("doc_count")))

        if args.verbose:
            hits = cast.deep_get(response, "hits", "hits")
            print("\nTop {0} \"{1}\" Results:".format(len(hits), category))
            print("-" * 42)
            for hit in hits:
                print(json.dumps(hit["_source"]))
            print("=" * 42)

        print(" ")

def test_strings_are_left_untouched(self):
    self.assertEqual("你好", JSONSerializer().dumps("你好"))

def test_decimal_serialization(self):
    if sys.version_info[:2] == (2, 6):
        raise SkipTest("Float rounding is broken in 2.6.")
    self.assertEqual('{"d":3.8}', JSONSerializer().dumps({"d": Decimal("3.8")}))

def default(self, data):
    if isinstance(data, set):
        return list(data)
    if isinstance(data, Decimal):
        return float(data)
    return JSONSerializer.default(self, data)

def test_raises_serialization_error_on_dump_error(self):
    self.assertRaises(SerializationError, JSONSerializer().dumps, object())

def main(args):
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding jobs running during the specified time range on a specified node.''')

    parser.add_argument('-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".')
    parser.add_argument('--starttime', metavar='YYYY-MM-DDTHH:MM:SS', dest='starttime', default=None,
        help='A timestamp representing the beginning of the absolute range to look for failed jobs; if not set, no lower bound will be imposed on the search.')
    parser.add_argument('--endtime', metavar='YYYY-MM-DDTHH:MM:SS', dest='endtime', default=None,
        help='A timestamp representing the end of the absolute range to look for failed jobs; if not set, no upper bound will be imposed on the search.')
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts', nargs='*', default=None,
        help='A list of hostnames to filter the results to.')
    parser.add_argument('-s', '--size', metavar='size', dest='size', default=1000,
        help='The number of results to be returned. (default=1000)')

    args = parser.parse_args()

    # If the target wasn't specified, check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    (range, match_min) = cast.build_time_range(args.starttime, args.endtime)

    bool_query = {
        "should": range,
        "minimum_should_match": match_min
    }

    if args.hosts:
        bool_query["must"] = {
            "match": {
                "data.compute_nodes": {
                    "query": " ".join(args.hosts)
                }
            }
        }

    body = {
        "query": {
            "bool": bool_query
        },
        "_source": [
            "data.allocation_id",
            "data.primary_job_id",
            "data.user_id",
            "data.user_name",
            "data.secondary_job_id",
            "data.begin_time",
            "data.history.end_time"],
        "size": args.size
    }

    json = JSONSerializer()

    # Open a connection to the elastic cluster.
    es = Elasticsearch(
        args.target,
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    tr_res = es.search(
        index="cast-allocation",
        body=body)

    # Get hit data.
    hits = cast.deep_get(tr_res, "hits", "hits")
    total_hits = cast.deep_get(tr_res, "hits", "total")
    hits_displayed = len(hits)

    print("# Search found {0} jobs running, displaying {1} jobs:\n".format(total_hits, len(hits)))

    # Display the results of the search.
    if hits_displayed > 0:
        print_fmt = "{5: <10} | {0: >13} | {1: >12} | {2: <14} | {3: <26} | {4: <26}"
        print(print_fmt.format("Allocation ID", "Prim. Job ID", "Second. Job ID",
            "Begin Time", "End Time", "User Name"))

        hits.sort(key=lambda x: cast.deep_get(x, "_source", "data", "allocation_id"), reverse=False)

        for hit in hits:
            data = cast.deep_get(hit, "_source", "data")
            if data:
                print(print_fmt.format(
                    data.get("allocation_id"),
                    data.get("primary_job_id"),
                    data.get("secondary_job_id"),
                    data.get("begin_time"),
                    cast.deep_get(data, "history", "end_time"),
                    data.get("user_name")))

    return 0

from newrelic.api.application import application_instance as application
from newrelic.api.background_task import BackgroundTask
from testing_support.db_settings import elasticsearch_settings

ES_SETTINGS = elasticsearch_settings()[0]
HOST = {'host': ES_SETTINGS['host'], 'port': int(ES_SETTINGS['port'])}

INDEX = 'contacts'
DOC_TYPE = 'person'
ID = 1
METHOD = _make_path(INDEX, DOC_TYPE, ID)
PARAMS = {}
HEADERS = {"Content-Type": "application/json"}
DATA = {"name": "Joe Tester"}
BODY = JSONSerializer().dumps(DATA).encode('utf-8')


def test_transport_get_connection():
    app = application()
    with BackgroundTask(app, 'transport_perform_request') as transaction:
        transport = Transport([HOST])
        transport.get_connection()

    expected = (ES_SETTINGS['host'], ES_SETTINGS['port'], None)
    assert transaction._nr_datastore_instance_info == expected


def test_transport_perform_request_urllib3():
    app = application()
    with BackgroundTask(app, 'perform_request_urllib3') as transaction:

def test_uuid_serialization(self):
    self.assertEquals(
        '{"d": "00000000-0000-0000-0000-000000000003"}',
        JSONSerializer().dumps(
            {'d': uuid.UUID('00000000-0000-0000-0000-000000000003')}))

class Command(BaseCommand):
    json = JSONSerializer()
    degradation_mapping = get_degradation_mapping()
    help = "Export indexed documents into machine-readable format"

    def add_arguments(self, parser):
        parser.add_argument("--from", default=0, type=int)
        parser.add_argument("--to", default=None, type=int)
        parser.add_argument(
            "--degrade_to_basic_ftm", default=False, action="store_true"
        )
        parser.add_argument("--outfile", type=str)
        parser.add_argument("--save_index_file", type=str)
        parser.add_argument("--threads", type=int, default=settings.NUM_THREADS)
        parser.add_argument("--batch_size", type=int, default=200)
        parser.add_argument(
            "datasource",
            choices=get_apps_with_data_model(),
            help="Which source should be exported",
        )

    @classmethod
    def to_entities(cls, rec, degrade):
        res = []
        for ftm in rec.to_entities():
            if degrade:
                ftm.schema = cls.degradation_mapping[ftm.schema.name]
            res.append(cls.json.dumps(ftm.to_dict()))
        return res

    def handle(self, *args, **options):
        if "datasource" not in options:
            self.stderr.write("You need to specify datasource to export")
            return

        config = django_apps.app_configs[options["datasource"]]
        Model = config.data_model
        qs = Model.objects.all()

        if options["from"] and options["to"] is not None:
            qs = qs[options["from"] : options["to"]]
        else:
            if options["from"]:
                qs = qs[options["from"] :]
            elif options["to"] is not None:
                qs = qs[: options["to"]]

        total = qs.count()

        out_fp = sys.stdout
        if options["outfile"]:
            if options["outfile"].endswith(".bz2"):
                out_fp = bz2.open(options["outfile"], "wt")
            else:
                out_fp = open(options["outfile"], "w")

        pool = Pool(options["threads"])

        with tqdm.tqdm(total=total) as pbar:
            for chunk in grouper(qs.iterator(), options["batch_size"]):
                for ftm in chain.from_iterable(
                    pool.imap(
                        partial(
                            Command.to_entities, degrade=options["degrade_to_basic_ftm"]
                        ),
                        filter(None, chunk),
                    )
                ):
                    out_fp.write(ftm + "\n")
                pbar.update(len(chunk))

        if options["save_index_file"]:
            index_data = {}
            if os.path.exists(options["save_index_file"]):
                with open(options["save_index_file"], "r") as fp:
                    try:
                        index_data = {
                            d["dataset_id"]: d for d in self.json.loads(fp.read())
                        }
                    except SerializationError:
                        self.stderr.write("Cannot load index file, recreating it")

            index_data[options["datasource"]] = {
                "foreign_id": "ua_{}".format(options["datasource"]),
                "country": "Ukraine",
                "dataset_id": options["datasource"],
                "records": total,
                "last_updated": timezone.now(),
                "information_url": "https://ring.org.ua",
                "publisher": _("Проект ring.org.ua"),
                "publisher_url": "https://ring.org.ua",
            }

            if os.path.exists(options["outfile"]):
                index_data[options["datasource"]]["dump_url"] = options["outfile"]

            from search.models import get_datasource_pages

            pages = get_datasource_pages()

            if options["datasource"] in pages:
                page = pages[options["datasource"]]

                index_data[options["datasource"]].update(
                    {
                        "information_url": "https://ring.org.ua{}".format(
                            page.get_absolute_url()
                        ),
                        "category": page.category,
                        "source_url": page.url,
                        "description": page.description,
                        "description_en": page.description_en,
                        "credits": page.credits,
                    }
                )

            with open(options["save_index_file"], "w") as fp:
                fp.write(self.json.dumps(list(index_data.values())))

import json

from django.core.exceptions import ImproperlyConfigured
from django.db import connection
from elasticsearch.serializer import JSONSerializer

from django_zombodb.indexes import ZomboDBIndex

json_serializer = JSONSerializer()


def get_zombodb_index_from_model(model):
    for index in model._meta.indexes:
        if isinstance(index, ZomboDBIndex):
            return index
    raise ImproperlyConfigured(
        "Can't find a ZomboDBIndex at model {model}. "
        "Did you forget it? ".format(model=model))


def _validate_query(index, post_data):
    with connection.cursor() as cursor:
        cursor.execute('''
            SELECT zdb.request(%(index_name)s, %(endpoint)s, 'POST', %(post_data)s);
        ''', {
            'index_name': index.name,
            'endpoint': '_validate/query',
            'post_data': post_data

def test_raises_serialization_error_on_load_error(self):
    self.assertRaises(SerializationError, JSONSerializer().loads, object())
    self.assertRaises(SerializationError, JSONSerializer().loads, "")
    self.assertRaises(SerializationError, JSONSerializer().loads, "{{")

def default(self, obj):
    if isinstance(obj, set):
        return list(obj)
    if isinstance(obj, uuid.UUID):
        return str(obj)
    return JSONSerializer.default(self, obj)

def test_datetime_serialization(self):
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": datetime(2010, 10, 1, 2, 30)}),
    )

def test_serializes_pandas_na(self):
    if not hasattr(pd, "NA"):  # pandas.NA added in v1
        raise SkipTest("pandas.NA required")
    self.assertEqual(
        '{"d":null}',
        JSONSerializer().dumps({"d": pd.NA}),
    )

def test_uuid_serialization(self):
    self.assertEqual(
        '{"d":"00000000-0000-0000-0000-000000000003"}',
        JSONSerializer().dumps(
            {"d": uuid.UUID("00000000-0000-0000-0000-000000000003")}),
    )

def main(args):
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding jobs running at the specified time.''')

    parser.add_argument('-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".')
    parser.add_argument('-T', '--time', metavar='YYYY-MM-DD HH:MM:SS', dest='timestamp', default="now",
        help='A timestamp representing a point in time to search for all running CSM Jobs. HH, MM, SS are optional; if not set they will be initialized to 0. (default=now)')
    parser.add_argument('-s', '--size', metavar='size', dest='size', default=1000,
        help='The number of results to be returned. (default=1000)')
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts', nargs='*', default=None,
        help='A list of hostnames to filter the results to.')

    args = parser.parse_args()

    # If the target wasn't specified, check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Parse the user's date.
    date_format = r'(\d{4})-(\d{1,2})-(\d{1,2})[ \.T]*(\d{0,2}):{0,1}(\d{0,2}):{0,1}(\d{0,2})'
    date_print_format = '%Y-%m-%d %H:%M:%S'
    date_search_format = '"yyyy-MM-dd HH:mm:ss"'

    target_date = args.timestamp
    time_search = re.search(date_format, target_date)

    # Build the target timestamp and verify validity.
    if time_search:
        (year, month, day, hour, minute, second) = time_search.groups()
        date = datetime(
            year=int(year),
            month=int(month),
            day=int(day),
            hour=int(hour if hour else 0),
            minute=int(minute if minute else 0),
            second=int(second if second else 0))
        target_date = datetime.strftime(date, date_print_format)
    elif target_date == "now":
        target_date = datetime.strftime(datetime.now(), date_print_format)
    else:
        parser.print_help()
        print("Invalid timestamp: {0}".format(target_date))
        return 2

    (range, match_min) = cast.build_target_time_search(target_date)

    bool_query = {
        "should": range,
        "minimum_should_match": match_min
    }

    if args.hosts:
        bool_query["must"] = {
            "match": {
                "data.compute_nodes": {
                    "query": " ".join(args.hosts)
                }
            }
        }

    body = {
        "query": {
            "bool": bool_query
        },
        "_source": [
            "data.allocation_id",
            "data.primary_job_id",
            "data.secondary_job_id",
            "data.begin_time",
            "data.history.end_time"],
        "size": args.size
    }

    json = JSONSerializer()

    # Open a connection to the elastic cluster.
    es = Elasticsearch(
        args.target,
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    tr_res = es.search(
        index="cast-allocation",
        body=body)

    # Get hit data.
    hits = cast.deep_get(tr_res, "hits", "hits")
    total_hits = cast.deep_get(tr_res, "hits", "total")
    hits_displayed = len(hits)

    print("Search found {0} jobs running at '{2}', displaying {1} jobs:\n".format(
        total_hits, len(hits), target_date))

    # Display the results of the search.
    if hits_displayed > 0:
        print_fmt = "{0: >13} | {1: >12} | {2: <14} | {3: <26} | {4: <26}"
        print(print_fmt.format("Allocation ID", "Prim. Job ID", "Second. Job ID",
            "Begin Time", "End Time"))

        for hit in hits:
            data = cast.deep_get(hit, "_source", "data")
            if data:
                print(print_fmt.format(
                    data.get("allocation_id"),
                    data.get("primary_job_id"),
                    data.get("secondary_job_id"),
                    data.get("begin_time"),
                    cast.deep_get(data, "history", "end_time")))

    return 0

def test_serializes_numpy_datetime(self):
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": np.datetime64("2010-10-01T02:30:00")}),
    )

def test_serializes_numpy_nan_to_nan(self):
    self.assertEqual(
        '{"d":NaN}',
        JSONSerializer().dumps({"d": np.nan}),
    )

def assertDictEqual(self, a, b):
    default = JSONSerializer().default
    self.assertEqual(
        json.dumps(a, sort_keys=True, default=default),
        json.dumps(b, sort_keys=True, default=default),
    )

def default(self, data):
    """entry point"""
    if isinstance(data, set):
        return list(data)
    return JSONSerializer.default(self, data)