Example #1
def coco_to_custom_vision(stream, project_id, trainer, data_dir):
    stream = stream | mp.as_list
    # create a Custom Vision tag for every distinct class present in the stream
    tags = stream | mp.select_field('class_id') | mp.dedup() | mp.as_list
    cv_tags = {tag: trainer.create_tag(project_id, tag) for tag in tags}

    stream = (
        stream
        # drop boxes smaller than 10% of the image in either dimension
        | mp.apply(['width', 'height', 'ground_truth'], 'ground_truth',
                   lambda x: x[2]
                   | mp.where(lambda box: (box['width'] >= x[0] * 0.1) and
                              (box['height'] >= x[1] * 0.1))
                   | mp.as_list)
        # keep only images that still have at least one box
        | mp.filter('ground_truth', lambda x: len(x) > 0)
        # convert boxes to Custom Vision regions with relative coordinates
        | mp.apply(
            ['width', 'height', 'ground_truth'], 'regions', lambda x: x[2]
            | mp.select(lambda box: Region(tag_id=cv_tags[box['tag']].id,
                                           left=box['x1'] / x[0],
                                           top=box['y1'] / x[1],
                                           width=box['width'] / x[0],
                                           height=box['height'] / x[1]))
            | mp.as_list)
        # pair every image file with its regions for upload
        | mp.apply(['filename', 'regions'], 'tagged_img',
                   lambda x: ImageFileCreateEntry(
                       name=x[0],
                       contents=open(join(data_dir, x[0]), mode="rb").read(),
                       regions=x[1]))
        | mp.as_list)
    tagged_images_with_regions = stream | mp.select_field('tagged_img') | mp.as_list
    # upload in small batches to stay under the service's per-call batch limit
    for i in range(0, len(tagged_images_with_regions), 50):
        trainer.create_images_from_files(
            project_id, images=tagged_images_with_regions[i:i + 50])
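A minimal usage sketch, assuming the azure-cognitiveservices-vision-customvision 1.x SDK (where the training client takes a key and an endpoint) and the get_coco_stream helper from Example #4; all keys and paths are placeholders:

from azure.cognitiveservices.vision.customvision.training import CustomVisionTrainingClient

# hypothetical key/endpoint; for object detection the project should be
# created with the id of a detection domain (see trainer.get_domains())
trainer = CustomVisionTrainingClient("<training-key>", endpoint="<endpoint>")
project = trainer.create_project("coco-detection")

stream = get_coco_stream(['cat', 'dog'], 'annotations.json', 'coco_data')
coco_to_custom_vision(stream, project.id, trainer, 'coco_data')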
Example #2
def apply_quantized_model(stream, data_dir, model, dest_field):
    # score each image with the quantized model, convert the raw predictions
    # with format_dict, and drop the intermediate raw field
    return (stream
            | mp.apply(
                'filename', dest_field + '_raw',
                lambda x: model.predict_image(Image.open(join(data_dir, x))))
            | mp.apply([dest_field + '_raw', 'width', 'height'], dest_field,
                       lambda x: x[0]
                       | mp.select(lambda p: format_dict(p, x[1], x[2]))
                       | mp.as_list)
            | mp.delfield([dest_field + '_raw']))
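A minimal usage sketch, assuming model wraps an exported Custom Vision model whose predict_image() accepts a PIL image, and that format_dict (defined elsewhere in this project) converts raw predictions into box dictionaries:

scored = apply_quantized_model(stream, 'images', model, 'predictions') | mp.as_list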
Example #3
def videos_to_frames_pipe(data_dir,
                          ext='.avi',
                          target_ext='.allframes.npy',
                          classes=None,
                          min_size=128,
                          max_elements=13320):
    """
    Creates and executes mPyPl pipe to load all videos from 'data_dir' (each subfolder is a separate class),
    extracts all frames and saves them in numpy format
    
    Parameters
    ----------
    data_dir : str, required
        The directory where all the vidoes are organised in subfolders (subfolder name=class name)
    ext : str, optional
        Extension of video files to search for, by default '.avi'
    target_ext : str, optional
        Target extension for video frames to be serialized to, by default '.allframes.npy'
    classes : dict, optional
        Dictionary with class names and numeral representations. Must match the folder names in 'data_dir'.
        If set to 'None' it will automatically figure out classes based on folders structure in 'data_dir'.
        Example {'Class1': 1, 'Class2': 2}
        Defaults to 'None'
    min_size : int, optional
        Minimum size of frames based on the shorter edge, by default 128
    max_elements : int, optional
        Max elements for silly progress indicator, by default 13320
    """
    (mp.get_datastream(data_dir, classes=classes, ext=ext)
     # open each video file as a moviepy clip
     | mp.apply('filename',
                'clip',
                lambda fn: VideoFileClip(fn),
                eval_strategy=mp.EvalStrategies.Value)
     # resize so that the shorter edge equals min_size
     | mp.apply('clip',
                'clip',
                lambda clip: clip.fx(vfx.resize, width=min_size)
                if clip.w <= clip.h else clip.fx(vfx.resize, height=min_size),
                eval_strategy=mp.EvalStrategies.Value)
     # collect all frames into a single numpy array
     | mp.apply('clip',
                'allframes',
                lambda c: np.asarray(list(c.iter_frames())),
                eval_strategy=mp.EvalStrategies.Value)
     # release the clip handle and drop it from the stream
     | mp.iter('clip', close_clip)
     | mp.delfield('clip')
     # serialize frames to the target extension, skipping existing files
     | cachecomputex(
         ext, target_ext, lambda x, nfn: np.save(nfn, x['allframes']),
         lambda x, nfn: print("Skipping saving 'allframes' for {}".format(x['filename'])))
     | mp.silly_progress(elements=max_elements)
     | mp.execute)
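A minimal usage sketch, assuming a UCF101-style layout where '/data/ucf101/<class>/<video>.avi' holds the source videos; frames are cached next to each video as '.allframes.npy':

videos_to_frames_pipe('/data/ucf101', ext='.avi', min_size=128)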
Example #4
def get_coco_stream(tags, ann_file, data_dir, threshold=0.1):
    coco = COCO(ann_file)
    # resolve category names to ids and collect all matching image ids
    catIds = coco.getCatIds(catNms=tags)
    imgIds = sum([coco.getImgIds(catIds=catId) for catId in catIds], [])
    stream = (coco.loadImgs(imgIds)
              | mp.as_field('meta')
              # lift the fields we need out of the COCO metadata record
              | mp.apply('meta', 'width', lambda x: x['width'])
              | mp.apply('meta', 'height', lambda x: x['height'])
              | mp.apply('meta', 'url', lambda x: x['coco_url'])
              | mp.apply('meta', 'filename', lambda x: x['file_name'])
              # load the annotations for each image and convert them to boxes
              | mp.apply(
                  'meta', 'anns_ids', lambda x: coco.getAnnIds(
                      imgIds=x['id'], catIds=catIds, iscrowd=None))
              | mp.apply('anns_ids', 'anns', lambda x: coco.loadAnns(x))
              | mp.apply(
                  'anns', 'ground_truth', lambda x: x
                  | mp.select(lambda m: bbox_to_dict(
                      m['bbox'], coco.cats[m['category_id']]['name']))
                  | mp.as_list)
              # label each image with its most frequent box tag
              | mp.apply(
                  'ground_truth', 'class_id',
                  lambda x: most_common(x
                                        | mp.select(lambda m: m['tag'])
                                        | mp.as_list))
              # download the image files and drop the intermediate fields
              | mp.iter('meta', lambda x: coco.download(data_dir, [x['id']]))
              | mp.delfield(['meta', 'anns_ids', 'anns']))
    return stream
Example #5
def precision_recall(data,
                     cls,
                     prob_threshold,
                     iou_threshold,
                     pred_field='predictions',
                     gt_field='ground_truth'):
    TP = 0.0
    TPFP = 0.0  # total positive results / pred
    TPFN = 0.0  # total existing cases / rel
    for obj in data:
        # ground-truth boxes of the requested class
        ground_truth = (obj[gt_field]
                        | mp.where(lambda x: x['tag'] == cls)
                        | mp.as_list)
        TPFN += len(ground_truth)
        # predictions of the same class above the probability threshold
        predictions = (obj[pred_field]
                       | mp.where(lambda x: x['tag'] == cls and
                                  x['prob'] > prob_threshold)
                       | mp.as_list)
        for gt_box in ground_truth:
            # a prediction matches the ground truth when its IoU reaches
            # iou_threshold
            pred_boxes = (predictions
                          | mp.apply(
                              ['x1', 'y1', 'width', 'height'], 'iou',
                              lambda x: intersection_over_union(
                                  x, (gt_box['x1'], gt_box['y1'],
                                      gt_box['width'], gt_box['height'])))
                          | mp.filter('iou', lambda x: x >= iou_threshold)
                          | mp.as_list)
            if len(pred_boxes) > 0:
                TP += 1
                TPFP += len(pred_boxes)
    return (float(TP > 0) if TPFP == 0 else TP / TPFP,
            float(TP > 0) if TPFN == 0 else TP / TPFN)
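A minimal sketch, assuming each element of data carries 'predictions' and 'ground_truth' box lists: sweep the probability threshold at a fixed IoU of 0.5 to trace a precision/recall curve for one class.

# hypothetical class name 'cat'; thresholds 0.1..0.9
points = [precision_recall(data, 'cat', p / 10.0, 0.5) for p in range(1, 10)]
for p, (prec, rec) in zip(range(1, 10), points):
    print("prob>{:.1f}: precision={:.2f} recall={:.2f}".format(p / 10.0, prec, rec))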
Example #6
def generate_img(data):
    n = random.randint(3, 30)
    x = (
        data
        | mp.pshuffle
        | mp.take(n)
        | mp.apply(
            'filename', 'image',
            lambda x: cv2.cvtColor(cv2.imread(os.path.splitext(x)[0] + '.jpg'),
                                   cv2.COLOR_BGR2RGB))
        | mp.apply(['image', 'descr'], 'face', transform)
        #| mp.apply('face','facesmall',functools.partial(im_resize,size=(100,150)))
        #| mp.select_field('facesmall')
        | mp.select_field('face')
        | mp.as_list)
    return merge(x, np.random.random(len(x)))
Example #7
def generate_img(data):
    x = (data
         | mp.pshuffle
         | mp.take(args.mix)
         | mp.apply(['image', 'landmarks'], 'face', transform)
         | mp.select_field('face')
         | mp.as_list)
    return merge(x, np.random.random(len(x)))
Example #8
def load_moviepy_video(seq,
                       filename_field='filename',
                       video_field='video',
                       eval_strategy=mp.EvalStrategies.LazyMemoized):
    return seq | mp.apply(filename_field,
                          video_field,
                          lambda x: VideoFileClip(x),
                          eval_strategy=eval_strategy)
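A minimal usage sketch; the 'videos' directory is a placeholder. Since load_moviepy_video takes the sequence as its first argument, it can be called directly on a datastream:

videos = (load_moviepy_video(mp.get_files('videos', ext='.mp4')
                             | mp.as_field('filename'))
          | mp.as_list)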
Example #9
def get_features_from_files(data_dir,
                            features_ext='.proc.c3d-avg.npy',
                            test_split=[],
                            classes=None,
                            max_elements=13320):
    """
    Creates and executes mPyPl pipe to load feature vectors from serialized files and returns a preprocessed
    data stream that can be further used with respect to train/test split and specific classes assigned to each element in the stream
    
    Parameters
    ----------
    data_dir : str, required
        The directory where all the vidoes are organised in subfolders (subfolder name=class name)
    features_ext : str, optional
        Extension of serialized feature vectors, by default '.proc.c3d-avg.npy'
    test_split : list, optional
        List of filenames belonging to the test subset. 
        If empty then there will be no data in the test subset, by default []
    classes : dict, optional
        Dictionary with class names and numeral representations. Must match the folder names in 'data_dir'.
        If set to 'None' it will automatically figure out classes based on folders structure in 'data_dir'.
        Example: {'Class1': 1, 'Class2': 2}
        Defaults to 'None'
    max_elements : int, optional
        Max elements for silly progress indicator, by default 13320
    
    Returns
    -------
    list of mPyPl.mdict.mdict
        List of dictionaries that can be used to access the data
    """

    data = (mp.get_datastream(data_dir, classes=classes, ext=features_ext)
            | mp.datasplit_by_pattern(test_pattern=test_split)
            | mp.pshuffle
            | mp.apply('filename', 'c3d_avg', lambda fn: np.load(fn))
            | mp.silly_progress(elements=max_elements)
            | mp.select_fields(['c3d_avg', 'class_id', 'split'])
            | mp.as_list)
    return data
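A minimal usage sketch, assuming '.proc.c3d-avg.npy' feature files under '/data/ucf101' and a hypothetical test_names list of filename patterns; make_train_test_split consumes the 'split' field produced by datasplit_by_pattern:

data = get_features_from_files('/data/ucf101', test_split=test_names)
train, test = data | mp.make_train_test_split()
X = np.array(train | mp.select_field('c3d_avg') | mp.as_list)
y = np.array(train | mp.select_field('class_id') | mp.as_list)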
Example #10

import os
from pipe import Pipe
from moviepy.editor import *
import numpy as np
import itertools
import cv2
import math
import matplotlib.pyplot as plt
import mPyPl as mp

import keras
from keras.models import Sequential
from keras.layers import *
from keras.regularizers import l2

test_names = (from_json(os.path.join(source_dir, 'matches.json'))
              | mp.where(lambda x: 'Test' in x.keys() and int(x['Test']) > 0)
              | mp.apply(['Id', 'Half'], 'pattern',
                         lambda x: "{}_{}_".format(x[0], x[1]))
              | mp.select_field('pattern')
              | mp.as_list)

no_frames = 126

data = (mp.get_datastream(data_dir, ext=".resized.mp4")
        | mp.pshuffle
        | mp.datasplit_by_pattern(test_pattern=test_names)
        | mp.apply('filename', 'vgg',
                   lambda x: np.load(x.replace('.resized.mp4', '.vgg.npy')))
        | mp.apply('vgg', 'vggflat',
                   lambda x: np.reshape(x, (no_frames, -1, 1)))
        | mp.take(500)
        | mp.as_list)
Example #11
              max(0, y - y_expand):min(im_w, y + h + y_expand)]


def merge(images, wts=None):
    # weighted average of equally-sized images; weights default to uniform
    res = np.zeros_like(images[0], dtype=np.float32)
    if wts is None:
        wts = np.ones(len(images))
    wts = wts / np.sum(wts)
    for n, i in enumerate(images):
        res += wts[n] * i.astype(np.float32)
    return res.astype(np.int32)
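A minimal sketch of merge on its own; img_a and img_b are hypothetical equally-sized images loaded elsewhere:

blended = merge([img_a, img_b], np.array([0.7, 0.3]))  # 70/30 weighted blend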


data = (mp.get_files(dir, ext='.json')
        | mp.as_field('filename')
        | mp.apply('filename', 'descr', loadjs)
        | mp.unroll('descr')
        | mp.filter(
            'descr', lambda x: person_id == "" or
            ('candidates' in x and person_id in
             [z['personId'] for z in x['candidates']]))
        | mp.filter(
            'descr', lambda x: abs(x['faceAttributes']['headPose']['yaw']) < 15
            and abs(x['faceAttributes']['headPose']['pitch']) < 15)
        | mp.filter(
            'descr', lambda x: x['faceLandmarks']['pupilRight']['x'] - x[
                'faceLandmarks']['pupilLeft']['x'] > 50)
        | mp.as_list)

print("Found {} faces".format(len(data)))
Example #12
def frames_to_features_pipe(data_dir,
                            mean_std,
                            model,
                            ext='.allframes.npy',
                            target_ext='.proc.c3d-avg.npy',
                            classes=None,
                            frames_per_clip=16,
                            frames_step=8,
                            batch_size=32,
                            max_elements=13320):
    """
    Creates and executes mPyPl pipe to load all video frames, resize and crop them, preprocess,
    run inferencing against Keras model and serialize the resulting feature vectors as npy format
    
    Parameters
    ----------
    data_dir : str, required
        The directory where all the vidoes are organised in subfolders (subfolder name=class name)
    mean_std : array, required
        Array of per channel mean and std values used for preprocessing of frames.
        Template: array[ [mean_R, mean_G, mean_B], [std_R, std_G, std_B] ]
        Example: array[ [123, 112, 145], [60, 62, 64] ]
    model : Keras model obj, required
        Keras model object ready for running predictions
    ext : str, optional
        Extension of frames files to search for, by default '.allframes.npy'
    target_ext : str, optional
        Target extension for feature vectors to be serialized to, by default '.proc.c3d-avg.npy'
    classes : dict, optional
        Dictionary with class names and numeral representations. Must match the folder names in 'data_dir'.
        If set to 'None' it will automatically figure out classes based on folders structure in 'data_dir'.
        Example: {'Class1': 1, 'Class2': 2}
        Defaults to 'None'
    frames_per_clip : int, optional
        When extracting smaller clips from longer video this defines the number of frames cut out from longer clip, by default 16
    frames_step : int, optional
        When extracting smaller clips from longer video this defines the step in number of frames, by default 8
    batch_size : int, optional
        Mini batch size used when pushing data to the model for scoring, by default 32
    max_elements : int, optional
        Max elements for silly progress indicator, by default 13320
    """

    (mp.get_datastream(data_dir, classes=classes, ext=ext)
     # load all frames for each video file
     | mp.apply('filename',
                'allframes',
                lambda fn: np.load(fn),
                eval_strategy=mp.EvalStrategies.OnDemand)
     # cut each video into multiple shorter clips defined by the frames_per_clip and frames_step parameters
     | mp.apply('allframes',
                'clips16-8',
                lambda v: extract_clips(
                    v, frames_per_clip=frames_per_clip, step=frames_step),
                eval_strategy=mp.EvalStrategies.OnDemand)
     # center crop frames into 112x112
     | mp.apply('clips16-8',
                'cropped16-8',
                lambda v: np.asarray([[crop_center(frame) for frame in clip]
                                      for clip in v]),
                eval_strategy=mp.EvalStrategies.OnDemand)
     # preprocess frames by subtracting the channel-wise mean
     | mp.apply('cropped16-8',
                'proc_cropped16-8',
                lambda v: preprocess_input(v, mean_std, divide_std=False),
                eval_strategy=mp.EvalStrategies.OnDemand)
     # run batch predictions on c3d model to get feature vectors for each clip
     | mp.apply_batch('proc_cropped16-8',
                      'c3d16-8',
                      lambda x: predict_c3d(x, model),
                      batch_size=batch_size)
     # for each full video take feature vectors for all the extracted clips and average them
     | mp.apply('c3d16-8',
                'c3d_avg',
                lambda v: np.average(v, axis=0),
                eval_strategy=mp.EvalStrategies.OnDemand)
     # draw silly progress
     | mp.silly_progress(elements=max_elements)
     # save averaged feature vectors into .npy files
     | cachecomputex(
         ext, target_ext, lambda x, nfn: np.save(nfn, x['c3d_avg']),
         lambda x, nfn: print("Skipping saving 'c3d_avg' {}".format(x['filename'])))
     | mp.execute)
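A minimal usage sketch, assuming a pretrained C3D Keras model saved as 'c3d.h5' and the per-channel statistics from the docstring example; all paths and values are placeholders:

mean_std = np.array([[123.0, 112.0, 145.0], [60.0, 62.0, 64.0]])
model = keras.models.load_model('c3d.h5')
frames_to_features_pipe('/data/ucf101', mean_std, model)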
Example #13

def fint(x):
    # parse numeric strings like '12.0' into an int
    return int(float(x))


@mp.Pipe
def import_fields(seq, field_name):
    # copy every key of a dict-valued field to the top level of each mdict
    for x in seq:
        try:
            for k in x[field_name].keys():
                x[k] = x[field_name][k]
            yield x
        except Exception:
            print("WARN: object field is {}".format(x.get(field_name)))


(mp.get_pascal_annotations(os.path.join(data_dir, "Annotations"))
 #| mp.take(100)
 | mp.unroll("object")
 | mp.apply("filename", "fname",
            lambda x: os.path.join(data_dir, "JPEGImages", x))
 | import_fields('object')
 | mp.apply('bndbox_xmin', 'xmin', fint)
 | mp.apply('bndbox_xmax', 'xmax', fint)
 | mp.apply('bndbox_ymin', 'ymin', fint)
 | mp.apply('bndbox_ymax', 'ymax', fint)
 | mp.select_fields(['fname', 'xmin', 'ymin', 'xmax', 'ymax', 'name'])
 | mp.write_csv('annotations.csv', write_headers=False))
Example #14
import os
from moviepy.editor import *
import numpy as np
import itertools
import cv2
import math
import matplotlib.pyplot as plt
import mPyPl as mp

import keras
from keras.models import Sequential
from keras.layers import *
from keras.regularizers import l2

# Get list of test videos from matches.json
test_names = (from_json(os.path.join(source_dir, 'matches.json'))
              | mp.where(lambda x: 'Test' in x.keys() and int(x['Test']) > 0)
              | mp.apply(['Id', 'Half'], 'pattern',
                         lambda x: "{}_{}_".format(x[0], x[1]))
              | mp.select_field('pattern')
              | mp.as_list)

data = (mp.get_datastream(data_dir, ext=".resized.mp4")
        | mp.datasplit_by_pattern(test_pattern=test_names)
        | stratify_sample_tt(shuffle=True)
        | summary()
        | mp.take(1000)
        | mp.iter('filename', lambda x: print("Processing {}".format(x)))
        | mp.apply(
            'filename', 'aud',
            lambda x: np.load(x.replace('.resized.mp4', '.audiofeatures.npy')))
        | normalize_npy_value('aud', interval=(-1, 1))
        | mp.as_list)
Example #15
File: simple.py Project: vJenny/mPyPl
# mPyPl - Monadic Pipeline Library for Python
# http://github.com/shwars/mPyPl

# Simple samples

import sys
sys.path.append('z:\\GitWork\\mPyPl')

import mPyPl as mp
from pipe import *
from mPyPl.utils.pipeutils import *

range(100) | mp.as_field('n') | mp.apply(
    'n', 'n5', lambda x: x % 5) | mp.dict_group_by('n5')

data = range(100) | mp.as_field('n') | mp.apply(
    'n', 'class_id', lambda x: x % 5) | mp.datasplit(split_value=0.2)
Tr, Te = data | mp.make_train_test_split()
len(Tr | as_list)
len(Te | as_list)

data = range(100) | mp.as_field('n') | mp.apply('n', 'class_id',
                                                lambda x: x % 5) | pshuffle

data | mp.sample_classes('class_id', 1, classes=range(15)) | as_list

x = mp.get_xmlstream_fromdir('e:\\data\\babylon\\')
Example #16
# mPyPl - Monadic Pipeline Library for Python
# http://github.com/shwars/mPyPl

# Simple samples

import sys
sys.path.insert(0, 'z:\\GitWork\\mPyPl')

import mPyPl as mp
from pipe import *
from mPyPl.utils.pipeutils import *
import cv2
import mPyPl.utils.image as mpui

print("Using mPyPl version " + mp.__version__)

range(100) | mp.as_field('n') | mp.apply(
    'n', 'n5', lambda x: x % 5) | mp.dict_group_by('n5')

data = range(100) | mp.as_field('n') | mp.apply(
    'n', 'class_id', lambda x: x % 5) | mp.datasplit(split_value=0.2)
Tr, Te = data | mp.make_train_test_split()
len(Tr | as_list)
len(Te | as_list)

data = range(100) | mp.as_field('n') | mp.apply('n', 'class_id',
                                                lambda x: x % 5) | pshuffle

data | mp.sample_classes('class_id', 1, classes=range(15)) | as_list

x = mp.get_xmlstream_fromdir('e:\\data\\babylon\\')

# Compute sum of integers from 1 to 100. Should return 5050
Example #17
import os
import cv2
import functools as fn
import keras

import mPyPl as mp
import mPyPl.utils.image as mpui
from pipe import *
from mPyPl.utils.pipeutils import *

print(mp.__version__)

train_dir = os.path.join(base_dir, 'training_set')
test_dir = os.path.join(base_dir, 'test_set')

classes = mp.get_classes(train_dir)
# we need to explicitly get classes in order to have the same correspondence of class and int for train and test set

# Show first few images from the training set
seq = (mp.get_datastream(train_dir, classes=classes)
       | take(10)
       | mp.apply(
           'filename', 'image',
           lambda fn: mpui.im_resize_pad(cv2.imread(fn), size=(100, 100)))
       | mp.select_field('image')
       | pexec(fn.partial(mpui.show_images, cols=2)))

transform = keras.preprocessing.image.ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

scale_transform = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)
Example #18
print("Face Dataset Generator")

print(" + Loading descriptions from {}".format(dir))


def loadjs(fn):
    with open(fn) as f:
        return json.load(f)


min_size = size / 3 if args.ignore_small else 0
max_faces_no = 2 if args.ignore_multiface else 99999

data = (mp.get_files(dir, ext='.json')
        | mp.as_field('filename')
        | mp.apply('filename', 'descr', loadjs)
        | mp.filter('descr', lambda x: len(x) > 0 and len(x) < max_faces_no)
        | mp.unroll('descr')
        | mp.filter(
            'descr', lambda x: abs(x['faceAttributes']['headPose']['yaw']) < 15
            and abs(x['faceAttributes']['headPose']['pitch']) < 15)
        | mp.filter(
            'descr', lambda x: x['faceLandmarks']['pupilRight']['x'] - x[
                'faceLandmarks']['pupilLeft']['x'] > min_size)
        | mp.as_list)

print(" + Found {} faces".format(len(data)))

print(" + Storing dataset...")

Example #19
    print(f"Processing {match['Video']} ({match['Id']})\n")
    videoFilePath = os.path.join(source_dir, match["Video"])
    marksFilePath = os.path.join(source_dir, "Marks.jsonl")
    video = VideoFileClip(videoFilePath)

    data = []
    with open(marksFilePath, 'r') as f:
        for cnt, line in enumerate(f):
            data.append(json.loads(line))

    # Get list of times when match halves start
    mt = data | mp.filter('eventType', lambda x: "start" in x) | mp.select_field('matchTime') | mp.as_list
    mt = convtime(mt[h])

    # Shots are fine, added also Goal which are not marked as shots
    cuts = data | mp.filter('eventType', shotFilter) | mp.apply('matchTime', 'start', convtime)

    # Consider, for negative examples, also Attacks, which are Shots nearby the goal area (or in other places of the field)
    # They are also filtered (later), removing those overlapping with shots/goals
    attacks = data | mp.filter('eventType', attackFilter) | mp.apply('matchTime', 'start', convtime)

    clipTrimTime = float(args.clipTrimTime)                       # +/- X seconds centered on event time

    clipTrimAttackTimeRange = float(args.clipTrimAttackTimeRange) # +/- X seconds centered on event time
    clipTrimAttackStart = float(args.clipTrimAttackStart)         # when attack clip starts (before event time)
    clipTrimAttackEnd = float(args.clipTrimAttackEnd)             # when attack clip ends (after event time)

    noShotClipDuration = float(args.noShotClipDuration)
    noShotClipInterval = float(args.noShotClipInterval)

    n = 0
Example #20
print(" + Loading images from {}".format(dir))

data = (
    mp.get_files(dir)
    | mp.as_field('filename')
    | mp.apply_nx('filename', 'image', lambda x: cv2.imread(x), print_exceptions=False)
    | mp.filter('image', lambda x: x is not None)
    | mp.as_list)

print(" + Found {} images".format(len(data)))

print(" + Extracting facial landmarks...")

data = (
    data
    | mp.apply(['filename', 'image'], 'landmarks', detect)
    | mp.filter('landmarks', lambda x: x != [])
    | mp.as_list)

if not args.nocache:
    print(" + Saving cache...")
    with open(os.path.join(script_dir, 'cache.pkl'), 'wb') as f:
        pickle.dump(cache, f)

def transform(args):
    image, f = args
    # mouth centre from the two mouth corners
    mc_x = (f['mouthLeft']['x'] + f['mouthRight']['x']) / 2.0
    mc_y = (f['mouthLeft']['y'] + f['mouthRight']['y']) / 2.0
    tr = cv2.getAffineTransform(
        np.float32([(f['pupilLeft']['x'], f['pupilLeft']['y']),
                    (f['pupilRight']['x'], f['pupilRight']['y']),
                    (mc_x, mc_y)]),
        np.float32(target_triangle))
    # assumed final step: warp onto the target triangle ('size' is a global
    # defined elsewhere in this script)
    return cv2.warpAffine(image, tr, (size, size))
Example #21
def plot_flow_descriptor(fd, step=5):
    fig, ax = plt.subplots(5, 2)
    for i in range(5):
        ax[i, 0].plot(fd[i * step, :, 0])
        ax[i, 1].plot(fd[i * step, :, 1])
    plt.show()


# Plot to see how it works
# fd = get_flow_descriptor(flow)
# plot_flow_descriptor(np.log(fd))

# Get list of test videos from matches.json
test_names = (from_json(os.path.join(source_dir, 'matches.json'))
              | mp.where(lambda x: 'Test' in x.keys() and int(x['Test']) > 0)
              | mp.apply(['Id', 'Half'], 'pattern',
                         lambda x: "{}_{}_".format(x[0], x[1]))
              | mp.select_field('pattern')
              | mp.as_list)

data = (mp.get_datastream(data_dir, ext=".resized.mp4")
        | datasplit_by_pattern(test_pattern=test_names)
        | stratify_sample_tt(shuffle=True)
        | summary()
        | mp.iter('filename', lambda x: print("Processing {}".format(x)))
        | mp.apply(
            'filename',
            'dflow',
            lambda x: np.load(x.replace('.resized.mp4', '.optflow.npy')),
            eval_strategy=mp.EvalStrategies.LazyMemoized)
        | mp.apply_npy('dflow', 'flowdescr', get_flow_descriptor)
        | mp.delfield('dflow')
        | mp.as_list)  # assumed closing step for this pipe
Example #22

# assumed head of resize_save: resize() below stores the (clip, resized) pair
# in the 'video' field, which this function writes out and closes
def resize_save(x, nfn):
    clip, fx = x['video']
    fx.write_videofile(nfn)
    clip.close()

def load_resize(x):
    fn = x['filename']
    nfn = fn.replace('.full.mp4', '.resized.mp4')
    x['filename'] = nfn
    if os.path.isfile(nfn):
        print("Loading resized {}".format(nfn))
        vc = VideoFileClip(nfn)
        return vc
    else:
        print("Resizing {}".format(fn))
        vc = VideoFileClip(fn).fx(vfx.resize, width=video_width)
        vc.write_videofile(nfn)
        return vc

def resize(x):
    v = VideoFileClip(x)
    vfxc = v.fx(vfx.resize, width=video_width)
    return (v, vfxc)

if __name__ == "__main__":
    (mp.get_datastream(data_dir, ext=".full.mp4")
     | where(lambda f: not os.path.isfile(f['filename'].replace(".full.mp4", ".resized.mp4")))
     | mp.apply('filename', 'video', resize)
     | cachecomputex(".full.mp4", ".resized.mp4", resize_save,
                     lambda x, nx: print("Skipping {}".format(x['filename'])))
     | execute)