Example #1
import logging

def generate_event_chunks_from_relay(spark_context, inbound_relay_hostname):
    # Wait for the inbound relay to come up, then stream chunks from it,
    # yielding one parsed-event RDD per chunk.
    relay_status.wait_until_running(inbound_relay_hostname)
    buffer_count = 0
    for data in read_chunks_from_relay(inbound_relay_hostname):
        buffer_count += 1
        line_iterator = generate_lines_from_chunk(data)
        yield distribute_and_parse_lines(spark_context, line_iterator)
    # Runs only once the relay stops producing chunks.
    logging.info("got %s chunks", buffer_count)
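A minimal sketch of how this generator might be consumed on the driver side; the SparkContext sc, the relay hostname, and handle_events are hypothetical names, not part of the example above.

# Hypothetical driver loop: each yielded value is the RDD produced by
# distribute_and_parse_lines for one relay chunk.
for events_rdd in generate_event_chunks_from_relay(sc, "inbound-relay"):
    handle_events(events_rdd.collect())  # handle_events is a placeholder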
Example #2
            # Resolve the HDFS input location from the environment and read
            # it back as one CSV record per line.
            hdfs_path = os.getenv("DLTK_HDFS_PATH", "")
            hdfs_url = "%s/%s" % (hdfs_url_base.strip("/"),
                                  hdfs_path.strip("/"))
            logging.info("hdfs_url: %s", hdfs_url)
            csv_strings = spark_context.textFile(hdfs_url)
            # Zip each CSV row against the expected field names, stripping
            # surrounding quotes from every value.
            objects = csv_strings.map(
                lambda s:
                {f: v.strip('"')
                 for f, v in zip(fields, s.split(","))})

            with opentracing.tracer.start_active_span('execute-algorithm'):
                output = method_impl(spark_context, objects)

            with opentracing.tracer.start_active_span(
                    'wait-for-output-relay'):
                relay_status.wait_until_running(outbound_relay_hostname)

            with opentracing.tracer.start_active_span('send-to-output-relay'):
                logging.info("sending output (type=%s) to relay...",
                             type(output))
                from pyspark.rdd import RDD
                # RDD results are handed to the relay as-is; anything else is
                # treated as a plain list, JSON-encoded, and sent as one chunk.
                if isinstance(output, RDD):
                    output_rdd = output
                    output_relay.send_rdd_to_relay(outbound_relay_hostname,
                                                   output_rdd)
                else:
                    output_list = output
                    logging.info("algo returned %s events", len(output_list))
                    output_relay.send_chunks_to_relay(
                        outbound_relay_hostname,
                        [json.dumps(output_list).encode()])
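Because the dispatch above accepts either an RDD or a plain list, a user-supplied method_impl can stay distributed or collect on the driver. Two minimal sketches, both hypothetical, assuming objects is the RDD of dicts built above:

def tag_events(spark_context, objects):
    # Hypothetical: returns an RDD, so the branch above calls
    # send_rdd_to_relay.
    return objects.map(lambda obj: dict(obj, processed="true"))

def count_events(spark_context, objects):
    # Hypothetical: returns a plain list, so the branch above JSON-encodes
    # it and calls send_chunks_to_relay.
    return [{"count": objects.count()}]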
Example #3
def events_from_hdfs(spark_context, inbound_relay_hostname, hdfs_url):
    # Wait for the inbound relay to start and then to report completion,
    # so the data at hdfs_url is complete before Spark reads it.
    relay_status.wait_until_running(inbound_relay_hostname)
    relay_status.wait_until_done(inbound_relay_hostname)
    lines = spark_context.textFile(hdfs_url)
    events = parse_events_from_lines(lines)
    return events
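parse_events_from_lines is not defined in these examples; a minimal sketch, assuming each HDFS line holds one JSON-encoded event:

import json

def parse_events_from_lines(lines):
    # Hypothetical parser: maps each text line to a decoded JSON event.
    return lines.map(json.loads)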
Example #4
def generate_events_from_relay(spark_context, inbound_relay_hostname):
    # Wait for the inbound relay, then stream its lines into Spark and
    # parse them into events.
    relay_status.wait_until_running(inbound_relay_hostname)
    line_iterator = generate_lines_from_relay(inbound_relay_hostname)
    return distribute_and_parse_lines(spark_context, line_iterator)
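distribute_and_parse_lines is used in Examples #1 and #4 but not shown; a minimal sketch, assuming the line iterator fits on the driver and each line is a JSON event:

import json

def distribute_and_parse_lines(spark_context, line_iterator):
    # Hypothetical helper: materialize the iterator on the driver,
    # distribute the lines as an RDD, and parse each one as JSON.
    return spark_context.parallelize(list(line_iterator)).map(json.loads)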