def lambda_handler(event, context):
    logger.info("lambda_function: aws_config_es_endpoint: "
                + os.environ['aws_config_es_endpoint'])
    destination = os.environ['aws_config_es_endpoint']
    iso_now_time = datetime.datetime.now().isoformat()
    logger.info("lambda_function: Snapshot Time: " + str(iso_now_time))
    event_str = json.dumps(event)
    logger.info("lambda_function: event_str: " + event_str)

    # The S3 event tells us which bucket and key triggered this invocation.
    bucket = event['Records'][0]['s3']['bucket']['name']
    logger.info("lambda_function: bucket: " + bucket)
    snapshot_file_path = event['Records'][0]['s3']['object']['key']
    snapshot_file_path_unquote = unquote(str(snapshot_file_path))
    logger.info("lambda_function: snapshot_file_path_unquote: "
                + snapshot_file_path_unquote)

    # Download the snapshot object to local storage before parsing it.
    s3conn = boto3.resource('s3')
    s3conn.meta.client.download_file(bucket, snapshot_file_path_unquote,
                                     DOWNLOADED_SNAPSHOT_FILE_NAME)

    es = elastic.ElasticSearch(connections=destination, log=None)
    es.set_not_analyzed_template()

    if "_ConfigSnapshot_" not in snapshot_file_path_unquote:
        logger.info("lambda_function: Not a Config Snapshot file!")
        return

    # Config snapshots are usually delivered gzipped; try that first,
    # then fall back to reading the file as plain JSON.
    logger.info("lambda_function: checking if compressed ConfigSnapshot: "
                + snapshot_file_path_unquote)
    with gzip.open(DOWNLOADED_SNAPSHOT_FILE_NAME, 'rt') as data_file:
        try:
            data = json.load(data_file)
            load_data_into_es(data, iso_now_time, es)
            return
        except Exception as e:
            logger.info("lambda_function: compressed: " + str(e))

    logger.info("lambda_function: checking if uncompressed ConfigSnapshot: "
                + snapshot_file_path_unquote)
    with open(DOWNLOADED_SNAPSHOT_FILE_NAME) as data_file:
        try:
            data = json.load(data_file)
            load_data_into_es(data, iso_now_time, es)
        except Exception as e:
            logger.info("lambda_function: uncompressed: " + str(e))
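# A minimal sketch of exercising lambda_handler locally with a hand-built S3
# event. The bucket name, object key, and endpoint value are hypothetical;
# logger, DOWNLOADED_SNAPSHOT_FILE_NAME, and load_data_into_es are assumed to
# be defined at module level as in the code above.
if __name__ == "__main__":
    os.environ['aws_config_es_endpoint'] = "http://localhost:9200"
    sample_event = {
        "Records": [{
            "s3": {
                "bucket": {"name": "example-config-bucket"},
                "object": {"key": "AWSLogs/123456789012/Config/us-east-1/"
                                  "example_ConfigSnapshot_20150101.json.gz"}
            }
        }]
    }
    lambda_handler(sample_event, None)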
def ingest_to_elastic(query_results, index="bing", keys=KEYS):
    es = elastic.ElasticSearch()
    es.create_indices(mappings=elastic.BING_MAPPINGS)
    for record in query_results["value"]:
        # Map each Bing result field onto the index's document keys; the
        # last field concatenates description and name for full-text search.
        body = dict(zip(keys, [record["description"],
                               record["url"],
                               record["name"],
                               record["provider"][0]["name"],
                               record["datePublished"],
                               record["description"] + record["name"]]))
        es.add_to_index(es, index, index, body)
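# A minimal sketch of the query_results shape ingest_to_elastic expects,
# inferred from the fields read above; the values are fabricated for
# illustration.
sample_results = {
    "value": [{
        "name": "Example headline",
        "url": "https://example.com/article",
        "description": "A short summary of the article.",
        "provider": [{"name": "Example News"}],
        "datePublished": "2017-01-01T00:00:00Z",
    }]
}
ingest_to_elastic(sample_results)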
def ingest_facts_into_es():
    # Initialize the Spark session and the Elasticsearch indices,
    # then read the data in.
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    es = elastic.ElasticSearch()
    es.create_indices()

    population_df = spark.read.csv(POPULATION_PATH, header=True)

    # UDF that maps a state name to its canonical form, or 0 if unknown.
    state_rename_udf = functions.udf(
        lambda x: STATES[x.strip()] if STATES.get(x.strip()) else 0,
        "string")

    for year in YEARS_AVAILABLE:
        df = spark.read.csv(
            os.path.join(ARSON_PATH, FILE_PREFIX + year + ".csv"),
            header=True)
        calculate_stats_with_spark(df, year, es)
        calculate_arson_density(population_df, df, year,
                                state_rename_udf, es)
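# ingest_facts_into_es relies on module-level constants like the following.
# The paths and prefix here are hypothetical placeholders, not taken from the
# source; only the 2009-2014 year range is grounded in the code further down.
POPULATION_PATH = "data/population.csv"   # hypothetical location
ARSON_PATH = "data/arson"                 # hypothetical location
FILE_PREFIX = "arson_"                    # hypothetical file prefix
YEARS_AVAILABLE = [str(year) for year in range(2009, 2015)]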
app_log = logging.getLogger("app")
app_log.setLevel(level=logging.INFO)

# Set up the verbose logger
verbose_log = logging.getLogger("verbose")
if args.verbose:
    verbose_log.setLevel(level=logging.INFO)
else:
    verbose_log.setLevel(level=logging.FATAL)

# Mute all other loggers
logging.getLogger("root").setLevel(level=logging.FATAL)
logging.getLogger("botocore.credentials").setLevel(level=logging.FATAL)
logging.getLogger(
    "botocore.vendored.requests.packages.urllib3.connectionpool").setLevel(
    level=logging.FATAL)
logging.getLogger("boto3").setLevel(level=logging.FATAL)
logging.getLogger("requests").setLevel(level=logging.FATAL)

if args.destination is None:
    app_log.error("You need to enter the IP of your Elasticsearch instance")
    exit()
destination = "http://" + args.destination

verbose_log.info("Setting up the Elasticsearch instance")
main(args, elastic.ElasticSearch(connections=destination, log=verbose_log))
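# The block above assumes an argparse namespace with `verbose` and
# `destination` attributes. In the actual script that parser would run before
# the logging setup; a minimal sketch of what it might look like:
import argparse

parser = argparse.ArgumentParser(
    description="Load data into an Elasticsearch instance")
parser.add_argument("-d", "--destination",
                    help="IP or host of the Elasticsearch instance")
parser.add_argument("-v", "--verbose", action="store_true",
                    help="Enable verbose logging")
args = parser.parse_args()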
import collections
import json

import pandas as pd

import elastic
from lookup_tables import STATE_CODES, STATES

es = elastic.ElasticSearch()

# Purple color scale used for the choropleth map.
COLOR_SCALE = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
               [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
               [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]
YEARS_AVAILABLE = [str(year) for year in range(2009, 2015)]


def build_density_df(hits):
    # Collect population density per state code from Elasticsearch hits.
    d = collections.defaultdict(str)
    for row in hits:
        source = row["_source"]
        d[STATE_CODES[source["state"]]] = source["pop_density"]
    return pd.DataFrame(list(d.items()), columns=["state", "pop_density"])


def build_text_df(indices, year):
    # Build per-state hover text by concatenating each index's value.
    desc = collections.defaultdict(str)
    for index in indices:
        response = es.query_index(index, year, field="year")
        for row in response:
            source = row["_source"]
            desc[source["state"]] += index + ": " + str(source[index]) + "<br>"
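# A minimal sketch of how build_density_df consumes Elasticsearch hits. The
# hit payloads are fabricated and assume "state" holds a full state name that
# appears as a key in STATE_CODES; the density figures are illustrative only.
sample_hits = [
    {"_source": {"state": "California", "pop_density": 251.3}},
    {"_source": {"state": "Texas", "pop_density": 108.4}},
]
density_df = build_density_df(sample_hits)
print(density_df)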