def generate_event_chunks_from_relay(spark_context, inbound_relay_hostname):
    # Wait for the inbound relay to come up, then stream its data chunk by
    # chunk, yielding one distributed/parsed result per chunk.
    relay_status.wait_until_running(inbound_relay_hostname)
    buffer_count = 0
    for data in read_chunks_from_relay(inbound_relay_hostname):
        buffer_count += 1
        line_iterator = generate_lines_from_chunk(data)
        yield distribute_and_parse_lines(spark_context, line_iterator)
    logging.info("got %s chunks" % (buffer_count))
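# Illustrative usage sketch (assumption, not part of the original module): one
# way a driver might consume the chunk generator above, assuming each yielded
# value is an RDD of parsed events and "inbound-relay" is a placeholder host.
#
#     for chunk_rdd in generate_event_chunks_from_relay(spark_context,
#                                                       "inbound-relay"):
#         logging.info("chunk contained %s events" % chunk_rdd.count())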
hdfs_path = os.getenv("DLTK_HDFS_PATH", "")
hdfs_url = "%s/%s" % (hdfs_url_base.strip("/"), hdfs_path.strip("/"))
logging.info("hdfs_url: %s" % hdfs_url)
# Read the staged CSV data from HDFS and turn each row into a field/value dict.
csv_strings = spark_context.textFile(hdfs_url)
objects = csv_strings.map(
    lambda s: {f: v.strip('"') for f, v in zip(fields, s.split(","))})
with opentracing.tracer.start_active_span('execute-algorithm'):
    output = method_impl(spark_context, objects)
with opentracing.tracer.start_active_span('wait-for-to-output-relay'):
    relay_status.wait_until_running(outbound_relay_hostname)
with opentracing.tracer.start_active_span('send-to-output-relay'):
    logging.info("sending output (type=%s) to relay..." % (type(output)))
    from pyspark.rdd import RDD
    if isinstance(output, RDD):
        # RDD output goes through the RDD-aware relay sender.
        output_rdd = output
        output_relay.send_rdd_to_relay(outbound_relay_hostname, output_rdd)
    else:
        # Anything else is treated as a list of events and sent as a single
        # JSON-encoded chunk.
        output_list = output
        logging.info("algo returned %s events" % len(output_list))
        output_relay.send_chunks_to_relay(
            outbound_relay_hostname, [json.dumps(output_list).encode()])
def events_from_hdfs(spark_context, inbound_relay_hostname, hdfs_url):
    relay_status.wait_until_running(inbound_relay_hostname)
    relay_status.wait_until_done(inbound_relay_hostname)
    lines = spark_context.textFile(hdfs_url)
    events = parse_events_from_lines(lines)
    return events
def generate_events_from_relay(spark_context, inbound_relay_hostname):
    relay_status.wait_until_running(inbound_relay_hostname)
    line_iterator = generate_lines_from_relay(inbound_relay_hostname)
    return distribute_and_parse_lines(spark_context, line_iterator)
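# Illustrative usage sketch (assumption, not part of the original module): the
# two readers above cover both transports; the hostname and HDFS URL below are
# placeholders.
#
#     from pyspark import SparkContext
#     sc = SparkContext.getOrCreate()
#     events = generate_events_from_relay(sc, "inbound-relay")
#     # or, once the relay has finished staging data to HDFS:
#     events = events_from_hdfs(sc, "inbound-relay",
#                               "hdfs://namenode/dltk/input")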