def process_spark_partitions(partition):
    """Process one Spark partition, combining records that share a key.

    Entries whose "From" field is not None are expanded into (key, record)
    pairs via process_line_spark; records with the same key are merged with
    reduceByKeyAndCombine.

    :param partition: iterable of dict-like entries for this partition
    :return: list of ((key, 1), combined_record) tuples
    """
    ctx = TaskContext()
    # The punkt tokenizer data must be present on every executor node,
    # hence the per-partition download (no-op if already cached locally).
    nltk.download('punkt')
    logger.info(f"start_processing_partition partitionId={ctx.partitionId()}")

    all_records = []
    for entry in partition:
        if entry["From"] is not None:
            all_records.extend(process_line_spark(entry))

    # Reduce: merge records sharing a key into a single combined record.
    final_dict = {}
    for key, rec in all_records:
        if key in final_dict:
            final_dict[key] = reduceByKeyAndCombine(final_dict[key], rec)
        else:
            final_dict[key] = rec

    all_final_records = [((key, 1), rec) for key, rec in final_dict.items()]
    logger.info(
        f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_final_records)} records"
    )
    return all_final_records
# Exemplo n.º 2
# 0
def task_info(*_):
    """Return a one-element list describing where this task is running."""
    context = TaskContext()
    description = "Stage: {0}, Partition: {1}, Host: {2}".format(
        context.stageId(),
        context.partitionId(),
        socket.gethostname(),
    )
    return [description]
def process_spark_partitions(partition):
    """Expand every entry in a Spark partition using the taxonomy and word model.

    Loads the shared taxonomy and the gensim model (vocab + model) once per
    partition, builds the combined vocabulary, then runs process_line_spark
    on each entry.

    :param partition: iterable of input entries for this partition
    :return: flat list of processed records
    """
    ctx = TaskContext()
    logger.info(f"start_processing_partition partitionId={ctx.partitionId()}")

    # TODO(review): these loads were flagged to move up to process_partitions
    # so they happen once per executor rather than once per partition.
    big_taxo = TaxonomyWrapper.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    gensim_model = GensimMagic.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    de_vocab = gensim_model["vocab"]
    de_model = gensim_model["model"]

    # Union of the embedding vocabulary with every domain's taxonomy terms.
    # Only the per-domain dicts are needed, so iterate values() directly.
    words_list = set(de_vocab.keys())
    for domain_dict in big_taxo.values():
        words_list |= set(domain_dict.keys())

    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry, big_taxo, de_model, de_vocab, words_list))
    logger.info(f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records")
    return all_records
def process_spark_partitions(partition):
    """Expand every entry in a Spark partition via process_line_spark.

    :param partition: iterable of input entries for this partition
    :return: flat list of processed records
    """
    ctx = TaskContext()
    # Log message aligned with the other partition processors in this file
    # ("start_processing_partition partitionId=..."; the old message read
    # "start_processing_partitionId=").
    logger.info(f"start_processing_partition partitionId={ctx.partitionId()}")
    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry))
    logger.info(
        f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records"
    )
    return all_records
# Exemplo n.º 5
# 0
    def detect_objects(iterator):
        """Run object detection over the image rows of one partition.

        :param iterator: rows that appear to follow Spark's image schema
            ('origin', 'height', 'width', 'nChannels', 'data') —
            assumption; confirm against the caller.
        """

        print("detecting objects in the image..")
        ctx = TaskContext()
        partition_id = ctx.partitionId()

        # Fetch the detection model from the per-executor pool; `model_name`,
        # `output_images_dir` etc. are captured from the enclosing scope.
        from app.model_pool import ModelPool
        model = ModelPool.get_model(model_name)
        print("partition_id : ", partition_id, "  model : ", model)

        for img_row in iterator:

            row_dict = img_row.asDict()
            filepath = row_dict['origin']
            height = row_dict['height']
            width = row_dict['width']
            nChannels = row_dict['nChannels']

            import os
            filename = os.path.basename(filepath)
            print("filename", filename)

            # Rebuild the image as an H x W x C uint8 array from the raw bytes,
            # then resize it for the model (resize_image also returns the scale).
            data = row_dict['data']
            shape = (height, width, nChannels)
            image_np_array = np.ndarray(shape, np.uint8, data)
            resized_image, scale = resize_image(image_np_array)

            # Predict on a batch of one image (leading batch axis added).
            boxes, scores, labels = model.predict_on_batch(
                np.expand_dims(resized_image, axis=0))

            for box, score, label in zip(boxes[0], scores[0], labels[0]):
                # scores are sorted so we can break

                # Confidence threshold; float("0.5") is just the literal 0.5.
                if score < float("0.5"):
                    break
                color = label_color(label)
                b = box.astype(int)
                draw_box(resized_image, b, color=color)
                caption = "{} {:.3f}".format(labels_to_names[label], score)
                draw_caption(resized_image, b, caption)

            # Persist the annotated (resized, not original-size) image.
            cv2.imwrite(output_images_dir + "/" + filename, resized_image)